From c73275cf6834787ca090317f1d20dbfa3b7f05aa Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 23 Aug 2022 09:15:03 +0800 Subject: [PATCH 0001/1423] apparmor: fix a memleak in multi_transaction_new() In multi_transaction_new(), the variable t is not freed or passed out on the failure of copy_from_user(t->data, buf, size), which could lead to a memleak. Fix this bug by adding a put_multi_transaction(t) in the error path. Fixes: 1dea3b41e84c5 ("apparmor: speed up transactional queries") Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index d066ccc219e2d..7160e7aa58b94 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -868,8 +868,10 @@ static struct multi_transaction *multi_transaction_new(struct file *file, if (!t) return ERR_PTR(-ENOMEM); kref_init(&t->count); - if (copy_from_user(t->data, buf, size)) + if (copy_from_user(t->data, buf, size)) { + put_multi_transaction(t); return ERR_PTR(-EFAULT); + } return t; } -- GitLab From 9c4557efc558a68e4cd973490fd936d6e3414db8 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 6 Sep 2022 03:39:55 -0700 Subject: [PATCH 0002/1423] apparmor: fix lockdep warning when removing a namespace Fix the following lockdep warning [ 1119.158984] ============================================ [ 1119.158988] WARNING: possible recursive locking detected [ 1119.158996] 6.0.0-rc1+ #257 Tainted: G E N [ 1119.158999] -------------------------------------------- [ 1119.159001] bash/80100 is trying to acquire lock: [ 1119.159007] ffff88803e79b4a0 (&ns->lock/1){+.+.}-{4:4}, at: destroy_ns.part.0+0x43/0x140 [ 1119.159028] but task is already holding lock: [ 1119.159030] ffff8881009764a0 (&ns->lock/1){+.+.}-{4:4}, at: aa_remove_profiles+0x3f0/0x640 [ 1119.159040] other info that might help us debug this: [ 1119.159042] Possible unsafe locking scenario: [ 1119.159043] CPU0 [ 1119.159045] ---- [ 1119.159047] lock(&ns->lock/1); [ 1119.159051] lock(&ns->lock/1); [ 1119.159055] *** DEADLOCK *** Which is caused by an incorrect lockdep nesting notation Fixes: feb3c766a3ab ("apparmor: fix possible recursive lock warning in __aa_create_ns") Signed-off-by: John Johansen --- security/apparmor/policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 499c0209b6a46..fbdfcef91c616 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -1170,7 +1170,7 @@ ssize_t aa_remove_profiles(struct aa_ns *policy_ns, struct aa_label *subj, if (!name) { /* remove namespace - can only happen if fqname[0] == ':' */ - mutex_lock_nested(&ns->parent->lock, ns->level); + mutex_lock_nested(&ns->parent->lock, ns->parent->level); __aa_bump_ns_revision(ns); __aa_remove_ns(ns); mutex_unlock(&ns->parent->lock); -- GitLab From f47acc4b7c43d566bf42816335830c4c17f9c200 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 6 Sep 2022 14:03:44 -0700 Subject: [PATCH 0003/1423] apparmor: reserve mediation classes Reserve mediation classes that exist in out of tree development branches or are used by userspace mediation helpers. Signed-off-by: John Johansen --- security/apparmor/include/apparmor.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 9c3fc36a07023..dd2c131ed1706 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -28,8 +28,15 @@ #define AA_CLASS_SIGNAL 10 #define AA_CLASS_NET 14 #define AA_CLASS_LABEL 16 +#define AA_CLASS_POSIX_MQUEUE 17 +#define AA_CLASS_IO_URING 18 +#define AA_CLASS_MODULE 19 +#define AA_CLASS_DISPLAY_LSM 20 -#define AA_CLASS_LAST AA_CLASS_LABEL +#define AA_CLASS_X 31 +#define AA_CLASS_DBUS 32 + +#define AA_CLASS_LAST AA_CLASS_DBUS /* Control parameters settable through module/boot flags */ extern enum audit_mode aa_g_audit; -- GitLab From f4d6b94b40c966ddd9eeb0d451e8a02c595ec7e3 Mon Sep 17 00:00:00 2001 From: Jon Tourville Date: Mon, 11 Jul 2022 11:36:08 -0500 Subject: [PATCH 0004/1423] apparmor: use zstd compression for profile data Change the algorithm used by apparmor to compress profile data from zlib to zstd, using the new zstd API introduced in 5.16. Zstd provides a larger range of compression levels than zlib and significantly better performance at the default level (for a relatively small increase in compressed size). The apparmor module parameter raw_data_compression_level is now clamped to the minimum and maximum compression levels reported by the zstd library. A compression level of 0 retains the previous behavior of disabling policy compression instead of using zstd's behavior, which is to use the default compression level. Signed-off-by: Jon Tourville Signed-off-by: John Johansen --- security/apparmor/Kconfig | 4 +- security/apparmor/apparmorfs.c | 60 +++++++--------- security/apparmor/lsm.c | 10 +-- security/apparmor/policy_unpack.c | 109 ++++++++++++++---------------- 4 files changed, 81 insertions(+), 102 deletions(-) diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig index cb3496e00d8a6..acac3bb3eef2b 100644 --- a/security/apparmor/Kconfig +++ b/security/apparmor/Kconfig @@ -85,8 +85,8 @@ config SECURITY_APPARMOR_HASH_DEFAULT config SECURITY_APPARMOR_EXPORT_BINARY bool "Allow exporting the raw binary policy" depends on SECURITY_APPARMOR_INTROSPECT_POLICY - select ZLIB_INFLATE - select ZLIB_DEFLATE + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS default y help This option allows reading back binary policy as it was loaded. diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7160e7aa58b94..d98bbf267fc77 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -1297,42 +1297,30 @@ SEQ_RAWDATA_FOPS(revision); SEQ_RAWDATA_FOPS(hash); SEQ_RAWDATA_FOPS(compressed_size); -static int deflate_decompress(char *src, size_t slen, char *dst, size_t dlen) +static int decompress_zstd(char *src, size_t slen, char *dst, size_t dlen) { #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY - if (aa_g_rawdata_compression_level != 0) { - int error = 0; - struct z_stream_s strm; - - memset(&strm, 0, sizeof(strm)); - - strm.workspace = kvzalloc(zlib_inflate_workspacesize(), GFP_KERNEL); - if (!strm.workspace) - return -ENOMEM; - - strm.next_in = src; - strm.avail_in = slen; - - error = zlib_inflateInit(&strm); - if (error != Z_OK) { - error = -ENOMEM; - goto fail_inflate_init; + if (aa_g_rawdata_compression_level == 0) { + const size_t wksp_len = zstd_dctx_workspace_bound(); + zstd_dctx *ctx; + void *wksp; + size_t out_len; + int ret = 0; + + wksp = kvzalloc(wksp_len, GFP_KERNEL); + if (!wksp) { + ret = -ENOMEM; + goto cleanup; } - strm.next_out = dst; - strm.avail_out = dlen; - - error = zlib_inflate(&strm, Z_FINISH); - if (error != Z_STREAM_END) - error = -EINVAL; - else - error = 0; - - zlib_inflateEnd(&strm); -fail_inflate_init: - kvfree(strm.workspace); - - return error; + out_len = zstd_decompress_dctx(ctx, dst, dlen, src, slen); + if (zstd_is_error(out_len)) { + ret = -EINVAL; + goto cleanup; + } +cleanup: + kvfree(wksp); + return ret; } #endif @@ -1381,9 +1369,9 @@ static int rawdata_open(struct inode *inode, struct file *file) private->loaddata = loaddata; - error = deflate_decompress(loaddata->data, loaddata->compressed_size, - RAWDATA_F_DATA_BUF(private), - loaddata->size); + error = decompress_zstd(loaddata->data, loaddata->compressed_size, + RAWDATA_F_DATA_BUF(private), + loaddata->size); if (error) goto fail_decompress; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index e29cade7b6627..ec873ff0a4bbb 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -1361,7 +1361,7 @@ module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ -int aa_g_rawdata_compression_level = Z_DEFAULT_COMPRESSION; +int aa_g_rawdata_compression_level = ZSTD_CLEVEL_DEFAULT; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); @@ -1543,9 +1543,9 @@ static int param_set_aacompressionlevel(const char *val, error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, - Z_NO_COMPRESSION, - Z_BEST_COMPRESSION); - pr_info("AppArmor: policy rawdata compression level set to %u\n", + zstd_min_clevel(), + zstd_max_clevel()); + pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 55d31bac4f35b..10e462d003213 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include "include/apparmor.h" #include "include/audit.h" @@ -1059,81 +1059,73 @@ struct aa_load_ent *aa_load_ent_alloc(void) return ent; } -static int deflate_compress(const char *src, size_t slen, char **dst, - size_t *dlen) +static int compress_zstd(const char *src, size_t slen, char **dst, size_t *dlen) { #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY - int error; - struct z_stream_s strm; - void *stgbuf, *dstbuf; - size_t stglen = deflateBound(slen); - - memset(&strm, 0, sizeof(strm)); - - if (stglen < slen) - return -EFBIG; - - strm.workspace = kvzalloc(zlib_deflate_workspacesize(MAX_WBITS, - MAX_MEM_LEVEL), - GFP_KERNEL); - if (!strm.workspace) - return -ENOMEM; - - error = zlib_deflateInit(&strm, aa_g_rawdata_compression_level); - if (error != Z_OK) { - error = -ENOMEM; - goto fail_deflate_init; + const zstd_parameters params = + zstd_get_params(aa_g_rawdata_compression_level, slen); + const size_t wksp_len = zstd_cctx_workspace_bound(¶ms.cParams); + void *wksp = NULL; + zstd_cctx *ctx = NULL; + size_t out_len = zstd_compress_bound(slen); + void *out = NULL; + int ret = 0; + + out = kvzalloc(out_len, GFP_KERNEL); + if (!out) { + ret = -ENOMEM; + goto cleanup; } - stgbuf = kvzalloc(stglen, GFP_KERNEL); - if (!stgbuf) { - error = -ENOMEM; - goto fail_stg_alloc; + wksp = kvzalloc(wksp_len, GFP_KERNEL); + if (!wksp) { + ret = -ENOMEM; + goto cleanup; } - strm.next_in = src; - strm.avail_in = slen; - strm.next_out = stgbuf; - strm.avail_out = stglen; + ctx = zstd_init_cctx(wksp, wksp_len); + if (!ctx) { + ret = -EINVAL; + goto cleanup; + } - error = zlib_deflate(&strm, Z_FINISH); - if (error != Z_STREAM_END) { - error = -EINVAL; - goto fail_deflate; + out_len = zstd_compress_cctx(ctx, out, out_len, src, slen, ¶ms); + if (zstd_is_error(out_len)) { + ret = -EINVAL; + goto cleanup; } - error = 0; - if (is_vmalloc_addr(stgbuf)) { - dstbuf = kvzalloc(strm.total_out, GFP_KERNEL); - if (dstbuf) { - memcpy(dstbuf, stgbuf, strm.total_out); - kvfree(stgbuf); + if (is_vmalloc_addr(out)) { + *dst = kvzalloc(out_len, GFP_KERNEL); + if (*dst) { + memcpy(*dst, out, out_len); + kvfree(out); + out = NULL; } - } else + } else { /* * If the staging buffer was kmalloc'd, then using krealloc is * probably going to be faster. The destination buffer will * always be smaller, so it's just shrunk, avoiding a memcpy */ - dstbuf = krealloc(stgbuf, strm.total_out, GFP_KERNEL); + *dst = krealloc(out, out_len, GFP_KERNEL); + } - if (!dstbuf) { - error = -ENOMEM; - goto fail_deflate; + if (!*dst) { + ret = -ENOMEM; + goto cleanup; } - *dst = dstbuf; - *dlen = strm.total_out; + *dlen = out_len; -fail_stg_alloc: - zlib_deflateEnd(&strm); -fail_deflate_init: - kvfree(strm.workspace); - return error; +cleanup: + if (ret) { + kvfree(out); + *dst = NULL; + } -fail_deflate: - kvfree(stgbuf); - goto fail_stg_alloc; + kvfree(wksp); + return ret; #else *dlen = slen; return 0; @@ -1142,7 +1134,6 @@ fail_deflate: static int compress_loaddata(struct aa_loaddata *data) { - AA_BUG(data->compressed_size > 0); /* @@ -1151,8 +1142,8 @@ static int compress_loaddata(struct aa_loaddata *data) */ if (aa_g_rawdata_compression_level != 0) { void *udata = data->data; - int error = deflate_compress(udata, data->size, &data->data, - &data->compressed_size); + int error = compress_zstd(udata, data->size, &data->data, + &data->compressed_size); if (error) return error; -- GitLab From 2218d08123362c63bab257caf5ec3bc1a6e87ae9 Mon Sep 17 00:00:00 2001 From: Jon Tourville Date: Mon, 11 Jul 2022 11:36:09 -0500 Subject: [PATCH 0005/1423] apparmor: expose compression level limits in sysfs Create two new files in apparmor's sysfs: /sys/kernel/security/apparmor/raw_data_compression_level_min /sys/kernel/security/apparmor/raw_data_compression_level_max These correspond to the minimum and maximum zstd compression levels that can be assigned to the apparmor module parameter raw_data_compression_level. Signed-off-by: Jon Tourville Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index d98bbf267fc77..044affb1ce839 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1199,10 +1199,24 @@ static int seq_ns_name_show(struct seq_file *seq, void *v) return 0; } +static int seq_ns_compress_min_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", zstd_min_clevel()); + return 0; +} + +static int seq_ns_compress_max_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", zstd_max_clevel()); + return 0; +} + SEQ_NS_FOPS(stacked); SEQ_NS_FOPS(nsstacked); SEQ_NS_FOPS(level); SEQ_NS_FOPS(name); +SEQ_NS_FOPS(compress_min); +SEQ_NS_FOPS(compress_max); /* policy/raw_data/ * file ops */ @@ -2382,6 +2396,8 @@ static struct aa_sfs_entry aa_sfs_entry_apparmor[] = { AA_SFS_FILE_FOPS(".ns_level", 0444, &seq_ns_level_fops), AA_SFS_FILE_FOPS(".ns_name", 0444, &seq_ns_name_fops), AA_SFS_FILE_FOPS("profiles", 0444, &aa_sfs_profiles_fops), + AA_SFS_FILE_FOPS("raw_data_compression_level_min", 0444, &seq_ns_compress_min_fops), + AA_SFS_FILE_FOPS("raw_data_compression_level_max", 0444, &seq_ns_compress_max_fops), AA_SFS_DIR("features", aa_sfs_entry_features), { } }; -- GitLab From 408d53e923bd852d5d80243a642004163db53a87 Mon Sep 17 00:00:00 2001 From: Mike Salvatore Date: Mon, 30 Mar 2020 16:43:29 -0400 Subject: [PATCH 0006/1423] apparmor: compute file permissions on profile load Rather than computing file permissions for each file access, file permissions can be computed once on profile load and stored for lookup. Signed-off-by: Mike Salvatore Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/domain.c | 10 +-- security/apparmor/file.c | 128 +++++++++++++++++++++--------- security/apparmor/include/file.h | 15 +++- security/apparmor/policy_unpack.c | 3 + 5 files changed, 110 insertions(+), 48 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 044affb1ce839..825b3093dcddf 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -624,7 +624,7 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, if (state) { struct path_cond cond = { }; - tmp = aa_compute_fperms(dfa, state, &cond); + tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); } } else if (profile->policy.dfa) { if (!PROFILE_MEDIATES(profile, *match_str)) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 91689d34d2811..2c99edd8953a1 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -162,7 +162,7 @@ next: if (!state) goto fail; } - *perms = aa_compute_fperms(profile->file.dfa, state, &cond); + *perms = *(aa_lookup_fperms(&(profile->file), state, &cond)); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -215,7 +215,7 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - tmp = aa_compute_fperms(profile->file.dfa, state, &cond); + tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { @@ -224,7 +224,7 @@ next: state = match_component(profile, tp, stack, start); if (!state) goto fail; - tmp = aa_compute_fperms(profile->file.dfa, state, &cond); + tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } @@ -661,7 +661,7 @@ static struct aa_label *profile_transition(struct aa_profile *profile, } /* find exec permissions for name */ - state = aa_str_perms(profile->file.dfa, state, name, cond, &perms); + state = aa_str_perms(&(profile->file), state, name, cond, &perms); if (perms.allow & MAY_EXEC) { /* exec permission determine how to transition */ new = x_to_label(profile, bprm, name, perms.xindex, &target, @@ -756,7 +756,7 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, } /* find exec permissions for name */ - state = aa_str_perms(profile->file.dfa, state, xname, cond, &perms); + state = aa_str_perms(&(profile->file), state, xname, cond, &perms); if (!(perms.allow & AA_MAY_ONEXEC)) { info = "no change_onexec valid for executable"; goto audit; diff --git a/security/apparmor/file.c b/security/apparmor/file.c index e1b7e93602e4c..710b7d7517eb4 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -201,47 +201,97 @@ static u32 map_old_perms(u32 old) return new; } +static void __aa_compute_fperms_allow(struct aa_perms *perms, + struct aa_dfa *dfa, + unsigned int state) +{ + perms->allow |= AA_MAY_GETATTR; + + /* change_profile wasn't determined by ownership in old mapping */ + if (ACCEPT_TABLE(dfa)[state] & 0x80000000) + perms->allow |= AA_MAY_CHANGE_PROFILE; + if (ACCEPT_TABLE(dfa)[state] & 0x40000000) + perms->allow |= AA_MAY_ONEXEC; +} + +static struct aa_perms __aa_compute_fperms_user(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_user_allow(dfa, state)); + perms.audit = map_old_perms(dfa_user_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); + perms.xindex = dfa_user_xindex(dfa, state); + + __aa_compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_other_allow(dfa, state)); + perms.audit = map_old_perms(dfa_other_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); + perms.xindex = dfa_other_xindex(dfa, state); + + __aa_compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +/** + * aa_compute_fperms - convert dfa compressed perms to internal perms and store + * them so they can be retrieved later. + * @file_rules: a file_rules structure containing a dfa (NOT NULL) for which + * permissions will be computed (NOT NULL) + * + * TODO: convert from dfa + state to permission entry + */ +void aa_compute_fperms(struct aa_file_rules *file_rules) +{ + int state; + int state_count = file_rules->dfa->tables[YYTD_ID_BASE]->td_lolen; + + // DFAs are restricted from having a state_count of less than 2 + file_rules->fperms_table = kvzalloc( + state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); + + // Since fperms_table is initialized with zeroes via kvzalloc(), we can + // skip the trap state (state == 0) + for (state = 1; state < state_count; state++) { + file_rules->fperms_table[state * 2] = + __aa_compute_fperms_user(file_rules->dfa, state); + file_rules->fperms_table[state * 2 + 1] = + __aa_compute_fperms_other(file_rules->dfa, state); + } +} + /** - * aa_compute_fperms - convert dfa compressed perms to internal perms - * @dfa: dfa to compute perms for (NOT NULL) + * aa_lookup_fperms - convert dfa compressed perms to internal perms + * @dfa: dfa to lookup perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * - * TODO: convert from dfa + state to permission entry, do computation conversion - * at load time. + * TODO: convert from dfa + state to permission entry * - * Returns: computed permission set + * Returns: a pointer to a file permission set */ -struct aa_perms aa_compute_fperms(struct aa_dfa *dfa, unsigned int state, - struct path_cond *cond) +struct aa_perms default_perms = {}; +struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, + unsigned int state, struct path_cond *cond) { - /* FIXME: change over to new dfa format - * currently file perms are encoded in the dfa, new format - * splits the permissions from the dfa. This mapping can be - * done at profile load - */ - struct aa_perms perms = { }; + if (!(file_rules->fperms_table)) + return &default_perms; - if (uid_eq(current_fsuid(), cond->uid)) { - perms.allow = map_old_perms(dfa_user_allow(dfa, state)); - perms.audit = map_old_perms(dfa_user_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); - perms.xindex = dfa_user_xindex(dfa, state); - } else { - perms.allow = map_old_perms(dfa_other_allow(dfa, state)); - perms.audit = map_old_perms(dfa_other_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); - perms.xindex = dfa_other_xindex(dfa, state); - } - perms.allow |= AA_MAY_GETATTR; + if (uid_eq(current_fsuid(), cond->uid)) + return &(file_rules->fperms_table[state * 2]); - /* change_profile wasn't determined by ownership in old mapping */ - if (ACCEPT_TABLE(dfa)[state] & 0x80000000) - perms.allow |= AA_MAY_CHANGE_PROFILE; - if (ACCEPT_TABLE(dfa)[state] & 0x40000000) - perms.allow |= AA_MAY_ONEXEC; - - return perms; + return &(file_rules->fperms_table[state * 2 + 1]); } /** @@ -254,13 +304,13 @@ struct aa_perms aa_compute_fperms(struct aa_dfa *dfa, unsigned int state, * * Returns: the final state in @dfa when beginning @start and walking @name */ -unsigned int aa_str_perms(struct aa_dfa *dfa, unsigned int start, +unsigned int aa_str_perms(struct aa_file_rules *file_rules, unsigned int start, const char *name, struct path_cond *cond, struct aa_perms *perms) { unsigned int state; - state = aa_dfa_match(dfa, start, name); - *perms = aa_compute_fperms(dfa, state, cond); + state = aa_dfa_match(file_rules->dfa, start, name); + *perms = *(aa_lookup_fperms(file_rules, state, cond)); return state; } @@ -273,7 +323,7 @@ int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, if (profile_unconfined(profile)) return 0; - aa_str_perms(profile->file.dfa, profile->file.start, name, cond, perms); + aa_str_perms(&(profile->file), profile->file.start, name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(profile, perms, op, request, name, NULL, NULL, @@ -380,7 +430,7 @@ static int profile_path_link(struct aa_profile *profile, error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ - state = aa_str_perms(profile->file.dfa, profile->file.start, lname, + state = aa_str_perms(&(profile->file), profile->file.start, lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) @@ -388,7 +438,7 @@ static int profile_path_link(struct aa_profile *profile, /* test to see if target can be paired with link */ state = aa_dfa_null_transition(profile->file.dfa, state); - aa_str_perms(profile->file.dfa, state, tname, cond, &perms); + aa_str_perms(&(profile->file), state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. @@ -410,7 +460,7 @@ static int profile_path_link(struct aa_profile *profile, /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ - aa_str_perms(profile->file.dfa, profile->file.start, tname, cond, + aa_str_perms(&(profile->file), profile->file.start, tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 029cb20e322d5..ab201d625a344 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -181,11 +181,13 @@ struct aa_file_rules { /* struct perms perms; */ struct aa_domain trans; /* TODO: add delegate table */ + struct aa_perms *fperms_table; }; -struct aa_perms aa_compute_fperms(struct aa_dfa *dfa, unsigned int state, - struct path_cond *cond); -unsigned int aa_str_perms(struct aa_dfa *dfa, unsigned int start, +void aa_compute_fperms(struct aa_file_rules *file_rules); +struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, + unsigned int state, struct path_cond *cond); +unsigned int aa_str_perms(struct aa_file_rules *file_rules, unsigned int start, const char *name, struct path_cond *cond, struct aa_perms *perms); @@ -204,10 +206,17 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, void aa_inherit_files(const struct cred *cred, struct files_struct *files); +static inline void aa_free_fperms_table(struct aa_perms *fperms_table) +{ + if (fperms_table) + kvfree(fperms_table); +} + static inline void aa_free_file_rules(struct aa_file_rules *rules) { aa_put_dfa(rules->dfa); aa_free_domain_entries(&rules->trans); + aa_free_fperms_table(rules->fperms_table); } /** diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 10e462d003213..54175bca42563 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -22,6 +22,7 @@ #include "include/audit.h" #include "include/cred.h" #include "include/crypto.h" +#include "include/file.h" #include "include/match.h" #include "include/path.h" #include "include/policy.h" @@ -878,6 +879,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else profile->file.dfa = aa_get_dfa(nulldfa); + aa_compute_fperms(&(profile->file)); + if (!unpack_trans_table(e, profile)) { info = "failed to unpack profile transition table"; goto fail; -- GitLab From b5b57993504f91785fa70e002e5e494fb549726e Mon Sep 17 00:00:00 2001 From: Mike Salvatore Date: Sun, 31 May 2020 10:52:06 -0400 Subject: [PATCH 0007/1423] apparmor: compute xmatch permissions on profile load Rather than computing xmatch permissions each time access is requested, these permissions can be computed once on profile load and stored for lookup. Signed-off-by: Mike Salvatore Signed-off-by: John Johansen --- security/apparmor/domain.c | 4 ++-- security/apparmor/include/policy.h | 2 ++ security/apparmor/policy.c | 1 + security/apparmor/policy_unpack.c | 22 +++++++++++++++++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 2c99edd8953a1..22351b6d71e65 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -339,7 +339,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, /* Check xattr value */ state = aa_dfa_match_len(profile->xmatch, state, value, size); - perm = dfa_user_allow(profile->xmatch, state); + perm = profile->xmatch_perms[state]; if (!(perm & MAY_EXEC)) { ret = -EINVAL; goto out; @@ -419,7 +419,7 @@ restart: state = aa_dfa_leftmatch(profile->xmatch, DFA_START, name, &count); - perm = dfa_user_allow(profile->xmatch, state); + perm = profile->xmatch_perms[state]; /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { int ret = 0; diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 639b5b248e63c..128c6a9430d46 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -104,6 +104,7 @@ struct aa_data { * @attach: human readable attachment string * @xmatch: optional extended matching for unconfined executables names * @xmatch_len: xmatch prefix len, used to determine xmatch priority + * @xmatch_perms: precomputed permissions for the xmatch DFA indexed by state * @audit: the auditing mode of the profile * @mode: the enforcement mode of the profile * @path_flags: flags controlling path generation behavior @@ -140,6 +141,7 @@ struct aa_profile { const char *attach; struct aa_dfa *xmatch; unsigned int xmatch_len; + u32 *xmatch_perms; enum audit_mode audit; long mode; u32 path_flags; diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index fbdfcef91c616..e2d23cd85cd20 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -231,6 +231,7 @@ void aa_free_profile(struct aa_profile *profile) kfree_sensitive(profile->secmark); kfree_sensitive(profile->dirname); aa_put_dfa(profile->xmatch); + kvfree(profile->xmatch_perms); aa_put_dfa(profile->policy.dfa); if (profile->data) { diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 54175bca42563..70b7a35b5b96a 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -669,6 +669,23 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) return strcmp(data->key, *key); } +static u32 *aa_compute_xmatch_perms(struct aa_dfa *xmatch) +{ + u32 *perms_table; + int state; + int state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; + + // DFAs are restricted from having a state_count of less than 2 + perms_table = kvcalloc(state_count, sizeof(u32), GFP_KERNEL); + + // Since perms_table is initialized with zeroes via kvcalloc(), we can + // skip the trap state (state == 0) + for (state = 1; state < state_count; state++) + perms_table[state] = dfa_user_allow(xmatch, state); + + return perms_table; +} + /** * unpack_profile - unpack a serialized profile * @e: serialized data extent information (NOT NULL) @@ -727,13 +744,16 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "bad xmatch"; goto fail; } - /* xmatch_len is not optional if xmatch is set */ + /* neither xmatch_len not xmatch_perms are optional if xmatch is set */ if (profile->xmatch) { if (!unpack_u32(e, &tmp, NULL)) { info = "missing xmatch len"; goto fail; } profile->xmatch_len = tmp; + + profile->xmatch_perms = aa_compute_xmatch_perms( + profile->xmatch); } /* disconnected attachment string is optional */ -- GitLab From 754f209b811ac462e00ed0f79b48047c446f5c43 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 12 Nov 2020 10:07:25 -0800 Subject: [PATCH 0008/1423] apparmor: move fperm computation into policy_unpack fperm computation is only needed during policy_unpack so move the code there to isolate it fromt the run time code. Signed-off-by: John Johansen --- security/apparmor/file.c | 97 ------------------------------ security/apparmor/include/file.h | 1 - security/apparmor/policy_unpack.c | 98 +++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 98 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 710b7d7517eb4..1227ae8391546 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -174,103 +174,6 @@ static int path_name(const char *op, struct aa_label *label, return 0; } -/** - * map_old_perms - map old file perms layout to the new layout - * @old: permission set in old mapping - * - * Returns: new permission mapping - */ -static u32 map_old_perms(u32 old) -{ - u32 new = old & 0xf; - if (old & MAY_READ) - new |= AA_MAY_GETATTR | AA_MAY_OPEN; - if (old & MAY_WRITE) - new |= AA_MAY_SETATTR | AA_MAY_CREATE | AA_MAY_DELETE | - AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_OPEN; - if (old & 0x10) - new |= AA_MAY_LINK; - /* the old mapping lock and link_subset flags where overlaid - * and use was determined by part of a pair that they were in - */ - if (old & 0x20) - new |= AA_MAY_LOCK | AA_LINK_SUBSET; - if (old & 0x40) /* AA_EXEC_MMAP */ - new |= AA_EXEC_MMAP; - - return new; -} - -static void __aa_compute_fperms_allow(struct aa_perms *perms, - struct aa_dfa *dfa, - unsigned int state) -{ - perms->allow |= AA_MAY_GETATTR; - - /* change_profile wasn't determined by ownership in old mapping */ - if (ACCEPT_TABLE(dfa)[state] & 0x80000000) - perms->allow |= AA_MAY_CHANGE_PROFILE; - if (ACCEPT_TABLE(dfa)[state] & 0x40000000) - perms->allow |= AA_MAY_ONEXEC; -} - -static struct aa_perms __aa_compute_fperms_user(struct aa_dfa *dfa, - unsigned int state) -{ - struct aa_perms perms = { }; - - perms.allow = map_old_perms(dfa_user_allow(dfa, state)); - perms.audit = map_old_perms(dfa_user_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); - perms.xindex = dfa_user_xindex(dfa, state); - - __aa_compute_fperms_allow(&perms, dfa, state); - - return perms; -} - -static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, - unsigned int state) -{ - struct aa_perms perms = { }; - - perms.allow = map_old_perms(dfa_other_allow(dfa, state)); - perms.audit = map_old_perms(dfa_other_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); - perms.xindex = dfa_other_xindex(dfa, state); - - __aa_compute_fperms_allow(&perms, dfa, state); - - return perms; -} - -/** - * aa_compute_fperms - convert dfa compressed perms to internal perms and store - * them so they can be retrieved later. - * @file_rules: a file_rules structure containing a dfa (NOT NULL) for which - * permissions will be computed (NOT NULL) - * - * TODO: convert from dfa + state to permission entry - */ -void aa_compute_fperms(struct aa_file_rules *file_rules) -{ - int state; - int state_count = file_rules->dfa->tables[YYTD_ID_BASE]->td_lolen; - - // DFAs are restricted from having a state_count of less than 2 - file_rules->fperms_table = kvzalloc( - state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); - - // Since fperms_table is initialized with zeroes via kvzalloc(), we can - // skip the trap state (state == 0) - for (state = 1; state < state_count; state++) { - file_rules->fperms_table[state * 2] = - __aa_compute_fperms_user(file_rules->dfa, state); - file_rules->fperms_table[state * 2 + 1] = - __aa_compute_fperms_other(file_rules->dfa, state); - } -} - /** * aa_lookup_fperms - convert dfa compressed perms to internal perms * @dfa: dfa to lookup perms for (NOT NULL) diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index ab201d625a344..1f9e54aa1adfe 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -184,7 +184,6 @@ struct aa_file_rules { struct aa_perms *fperms_table; }; -void aa_compute_fperms(struct aa_file_rules *file_rules); struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, unsigned int state, struct path_cond *cond); unsigned int aa_str_perms(struct aa_file_rules *file_rules, unsigned int start, diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 70b7a35b5b96a..c22c6815ff4bb 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -669,6 +669,104 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) return strcmp(data->key, *key); } +/** + * map_old_perms - map old file perms layout to the new layout + * @old: permission set in old mapping + * + * Returns: new permission mapping + */ +static u32 map_old_perms(u32 old) +{ + u32 new = old & 0xf; + + if (old & MAY_READ) + new |= AA_MAY_GETATTR | AA_MAY_OPEN; + if (old & MAY_WRITE) + new |= AA_MAY_SETATTR | AA_MAY_CREATE | AA_MAY_DELETE | + AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_OPEN; + if (old & 0x10) + new |= AA_MAY_LINK; + /* the old mapping lock and link_subset flags where overlaid + * and use was determined by part of a pair that they were in + */ + if (old & 0x20) + new |= AA_MAY_LOCK | AA_LINK_SUBSET; + if (old & 0x40) /* AA_EXEC_MMAP */ + new |= AA_EXEC_MMAP; + + return new; +} + +static void __aa_compute_fperms_allow(struct aa_perms *perms, + struct aa_dfa *dfa, + unsigned int state) +{ + perms->allow |= AA_MAY_GETATTR; + + /* change_profile wasn't determined by ownership in old mapping */ + if (ACCEPT_TABLE(dfa)[state] & 0x80000000) + perms->allow |= AA_MAY_CHANGE_PROFILE; + if (ACCEPT_TABLE(dfa)[state] & 0x40000000) + perms->allow |= AA_MAY_ONEXEC; +} + +static struct aa_perms __aa_compute_fperms_user(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_user_allow(dfa, state)); + perms.audit = map_old_perms(dfa_user_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); + perms.xindex = dfa_user_xindex(dfa, state); + + __aa_compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_other_allow(dfa, state)); + perms.audit = map_old_perms(dfa_other_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); + perms.xindex = dfa_other_xindex(dfa, state); + + __aa_compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +/** + * aa_compute_fperms - convert dfa compressed perms to internal perms and store + * them so they can be retrieved later. + * @file_rules: a file_rules structure containing a dfa (NOT NULL) for which + * permissions will be computed (NOT NULL) + * + * TODO: convert from dfa + state to permission entry + */ +static void aa_compute_fperms(struct aa_file_rules *file_rules) +{ + int state; + int state_count = file_rules->dfa->tables[YYTD_ID_BASE]->td_lolen; + + // DFAs are restricted from having a state_count of less than 2 + file_rules->fperms_table = kvzalloc( + state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); + + // Since fperms_table is initialized with zeroes via kvzalloc(), we can + // skip the trap state (state == 0) + for (state = 1; state < state_count; state++) { + file_rules->fperms_table[state * 2] = + __aa_compute_fperms_user(file_rules->dfa, state); + file_rules->fperms_table[state * 2 + 1] = + __aa_compute_fperms_other(file_rules->dfa, state); + } +} + static u32 *aa_compute_xmatch_perms(struct aa_dfa *xmatch) { u32 *perms_table; -- GitLab From 0310f093ba95e7640c886298de36560c123df5bd Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 12 Nov 2020 10:26:26 -0800 Subject: [PATCH 0009/1423] apparmor: rework and cleanup fperm computation shorten the name of some of the mapping functions which shortens line lengths. change the mapping so it returns the perm table instead of operating directly on the file struct. Handle potential memory allocation failure. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 70 +++++++++++++++++-------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index c22c6815ff4bb..0f9a88354d639 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -697,9 +697,8 @@ static u32 map_old_perms(u32 old) return new; } -static void __aa_compute_fperms_allow(struct aa_perms *perms, - struct aa_dfa *dfa, - unsigned int state) +static void compute_fperms_allow(struct aa_perms *perms, struct aa_dfa *dfa, + unsigned int state) { perms->allow |= AA_MAY_GETATTR; @@ -710,8 +709,8 @@ static void __aa_compute_fperms_allow(struct aa_perms *perms, perms->allow |= AA_MAY_ONEXEC; } -static struct aa_perms __aa_compute_fperms_user(struct aa_dfa *dfa, - unsigned int state) +static struct aa_perms compute_fperms_user(struct aa_dfa *dfa, + unsigned int state) { struct aa_perms perms = { }; @@ -720,13 +719,13 @@ static struct aa_perms __aa_compute_fperms_user(struct aa_dfa *dfa, perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); perms.xindex = dfa_user_xindex(dfa, state); - __aa_compute_fperms_allow(&perms, dfa, state); + compute_fperms_allow(&perms, dfa, state); return perms; } -static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, - unsigned int state) +static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, + unsigned int state) { struct aa_perms perms = { }; @@ -735,7 +734,7 @@ static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); perms.xindex = dfa_other_xindex(dfa, state); - __aa_compute_fperms_allow(&perms, dfa, state); + compute_fperms_allow(&perms, dfa, state); return perms; } @@ -743,41 +742,46 @@ static struct aa_perms __aa_compute_fperms_other(struct aa_dfa *dfa, /** * aa_compute_fperms - convert dfa compressed perms to internal perms and store * them so they can be retrieved later. - * @file_rules: a file_rules structure containing a dfa (NOT NULL) for which - * permissions will be computed (NOT NULL) + * @dfa: a dfa using fperms to remap to internal permissions * - * TODO: convert from dfa + state to permission entry + * Returns: remapped perm table */ -static void aa_compute_fperms(struct aa_file_rules *file_rules) +static struct aa_perms *compute_fperms(struct aa_dfa *dfa) { int state; - int state_count = file_rules->dfa->tables[YYTD_ID_BASE]->td_lolen; + int state_count; + struct aa_perms *table; - // DFAs are restricted from having a state_count of less than 2 - file_rules->fperms_table = kvzalloc( - state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); + AA_BUG(!dfa); - // Since fperms_table is initialized with zeroes via kvzalloc(), we can - // skip the trap state (state == 0) + state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ + table = kvzalloc(state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); + if (!table) + return NULL; + + /* zero init so skip the trap state (state == 0) */ for (state = 1; state < state_count; state++) { - file_rules->fperms_table[state * 2] = - __aa_compute_fperms_user(file_rules->dfa, state); - file_rules->fperms_table[state * 2 + 1] = - __aa_compute_fperms_other(file_rules->dfa, state); + table[state * 2] = compute_fperms_user(dfa, state); + table[state * 2 + 1] = compute_fperms_other(dfa, state); } + + return table; } -static u32 *aa_compute_xmatch_perms(struct aa_dfa *xmatch) +static u32 *compute_xmatch_perms(struct aa_dfa *xmatch) { u32 *perms_table; int state; - int state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; + int state_count; + + AA_BUG(!xmatch); - // DFAs are restricted from having a state_count of less than 2 + state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ perms_table = kvcalloc(state_count, sizeof(u32), GFP_KERNEL); - // Since perms_table is initialized with zeroes via kvcalloc(), we can - // skip the trap state (state == 0) + /* zero init so skip the trap state (state == 0) */ for (state = 1; state < state_count; state++) perms_table[state] = dfa_user_allow(xmatch, state); @@ -850,8 +854,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } profile->xmatch_len = tmp; - profile->xmatch_perms = aa_compute_xmatch_perms( - profile->xmatch); + profile->xmatch_perms = compute_xmatch_perms(profile->xmatch); } /* disconnected attachment string is optional */ @@ -997,8 +1000,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else profile->file.dfa = aa_get_dfa(nulldfa); - aa_compute_fperms(&(profile->file)); - + profile->file.fperms_table = compute_fperms(profile->file.dfa); + if (!profile->file.fperms_table) { + info = "failed to remap file permission table"; + goto fail; + } if (!unpack_trans_table(e, profile)) { info = "failed to unpack profile transition table"; goto fail; -- GitLab From e48ffd24c1d87dba227225615790cd059a707adb Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 13 Nov 2020 16:30:47 -0800 Subject: [PATCH 0010/1423] apparmor: convert xmatch to use aa_perms structure Convert xmatch from using perms encoded in the accept entry of the dfa to the common external aa_perms in a table. Signed-off-by: John Johansen --- security/apparmor/domain.c | 4 ++-- security/apparmor/include/policy.h | 3 ++- security/apparmor/policy_unpack.c | 13 +++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 22351b6d71e65..4fcdcc0de48cc 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -339,7 +339,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, /* Check xattr value */ state = aa_dfa_match_len(profile->xmatch, state, value, size); - perm = profile->xmatch_perms[state]; + perm = profile->xmatch_perms[state].allow; if (!(perm & MAY_EXEC)) { ret = -EINVAL; goto out; @@ -419,7 +419,7 @@ restart: state = aa_dfa_leftmatch(profile->xmatch, DFA_START, name, &count); - perm = profile->xmatch_perms[state]; + perm = profile->xmatch_perms[state].allow; /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { int ret = 0; diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 128c6a9430d46..7882d5e5096b1 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -141,7 +141,8 @@ struct aa_profile { const char *attach; struct aa_dfa *xmatch; unsigned int xmatch_len; - u32 *xmatch_perms; + struct aa_perms *xmatch_perms; + enum audit_mode audit; long mode; u32 path_flags; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 0f9a88354d639..44910c201c492 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -769,9 +769,9 @@ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) return table; } -static u32 *compute_xmatch_perms(struct aa_dfa *xmatch) +static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) { - u32 *perms_table; + struct aa_perms *perms_table; int state; int state_count; @@ -779,11 +779,12 @@ static u32 *compute_xmatch_perms(struct aa_dfa *xmatch) state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; /* DFAs are restricted from having a state_count of less than 2 */ - perms_table = kvcalloc(state_count, sizeof(u32), GFP_KERNEL); + perms_table = kvcalloc(state_count, sizeof(struct aa_perms), + GFP_KERNEL); /* zero init so skip the trap state (state == 0) */ for (state = 1; state < state_count; state++) - perms_table[state] = dfa_user_allow(xmatch, state); + perms_table[state].allow = dfa_user_allow(xmatch, state); return perms_table; } @@ -855,6 +856,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile->xmatch_len = tmp; profile->xmatch_perms = compute_xmatch_perms(profile->xmatch); + if (!profile->xmatch_perms) { + info = "failed to convert xmatch permission table"; + goto fail; + } } /* disconnected attachment string is optional */ -- GitLab From e2967ede22978f132cd52929edff96c701bde0eb Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 17 Nov 2020 01:38:16 -0800 Subject: [PATCH 0011/1423] apparmor: compute policydb permission on profile load Rather than computing policydb permissions for each access permissions can be computed once on profile load and stored for lookup. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/perms.h | 13 +++++-- security/apparmor/include/policy.h | 1 + security/apparmor/label.c | 6 ++-- security/apparmor/lib.c | 42 ----------------------- security/apparmor/mount.c | 53 ++++++++++------------------ security/apparmor/net.c | 2 +- security/apparmor/policy.c | 2 +- security/apparmor/policy_unpack.c | 55 +++++++++++++++++++++++++++++- 9 files changed, 90 insertions(+), 86 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 825b3093dcddf..117783779337d 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -633,7 +633,7 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, state = aa_dfa_match_len(dfa, profile->policy.start[0], match_str, match_len); if (state) - aa_compute_perms(dfa, state, &tmp); + tmp = *aa_lookup_perms(profile->policy.perms, state); } aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum_raw(perms, &tmp); diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 13f20c5984480..de9631edb1ff9 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -133,6 +133,17 @@ extern struct aa_perms allperms; xcheck(fn_for_each((L1), (P), (FN1)), fn_for_each((L2), (P), (FN2))) +extern struct aa_perms default_perms; + +static inline struct aa_perms *aa_lookup_perms(struct aa_perms *perms, + unsigned int state) +{ + if (!(perms)) + return &default_perms; + + return &(perms[state]); +} + void aa_perm_mask_to_str(char *str, size_t str_size, const char *chrs, u32 mask); void aa_audit_perm_names(struct audit_buffer *ab, const char * const *names, @@ -141,8 +152,6 @@ void aa_audit_perm_mask(struct audit_buffer *ab, u32 mask, const char *chrs, u32 chrsmask, const char * const *names, u32 namesmask); void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms); -void aa_compute_perms(struct aa_dfa *dfa, unsigned int state, - struct aa_perms *perms); void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend); void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend); void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 7882d5e5096b1..0dec18cd95e56 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -77,6 +77,7 @@ enum profile_mode { struct aa_policydb { /* Generic policy DFA specific rule types will be subsections of it */ struct aa_dfa *dfa; + struct aa_perms *perms; unsigned int start[AA_CLASS_LAST + 1]; }; diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 0f36ee9074381..ddb04417bdabf 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1328,7 +1328,7 @@ next: if (!state) goto fail; } - aa_compute_perms(profile->policy.dfa, state, perms); + *perms = *aa_lookup_perms(profile->policy.perms, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -1379,7 +1379,7 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - aa_compute_perms(profile->policy.dfa, state, &tmp); + tmp = *aa_lookup_perms(profile->policy.perms, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { @@ -1388,7 +1388,7 @@ next: state = match_component(profile, tp, start); if (!state) goto fail; - aa_compute_perms(profile->policy.dfa, state, &tmp); + tmp = *aa_lookup_perms(profile->policy.perms, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 1c72a61108d3f..505ef5848f7ce 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -315,48 +315,6 @@ void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms) */ } -static u32 map_other(u32 x) -{ - return ((x & 0x3) << 8) | /* SETATTR/GETATTR */ - ((x & 0x1c) << 18) | /* ACCEPT/BIND/LISTEN */ - ((x & 0x60) << 19); /* SETOPT/GETOPT */ -} - -static u32 map_xbits(u32 x) -{ - return ((x & 0x1) << 7) | - ((x & 0x7e) << 9); -} - -void aa_compute_perms(struct aa_dfa *dfa, unsigned int state, - struct aa_perms *perms) -{ - /* This mapping is convulated due to history. - * v1-v4: only file perms - * v5: added policydb which dropped in perm user conditional to - * gain new perm bits, but had to map around the xbits because - * the userspace compiler was still munging them. - * v9: adds using the xbits in policydb because the compiler now - * supports treating policydb permission bits different. - * Unfortunately there is not way to force auditing on the - * perms represented by the xbits - */ - *perms = (struct aa_perms) { - .allow = dfa_user_allow(dfa, state) | - map_xbits(dfa_user_xbits(dfa, state)), - .audit = dfa_user_audit(dfa, state), - .quiet = dfa_user_quiet(dfa, state) | - map_xbits(dfa_other_xbits(dfa, state)), - }; - - /* for v5-v9 perm mapping in the policydb, the other set is used - * to extend the general perm set - */ - perms->allow |= map_other(dfa_other_allow(dfa, state)); - perms->audit |= map_other(dfa_other_audit(dfa, state)); - perms->quiet |= map_other(dfa_other_quiet(dfa, state)); -} - /** * aa_perms_accum_raw - accumulate perms with out masking off overlapping perms * @accum - perms struct to accumulate into diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index f612472418033..1e978c2b1ee45 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -203,25 +203,6 @@ static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state, return state; } -/** - * compute_mnt_perms - compute mount permission associated with @state - * @dfa: dfa to match against (NOT NULL) - * @state: state match finished in - * - * Returns: mount permissions - */ -static struct aa_perms compute_mnt_perms(struct aa_dfa *dfa, - unsigned int state) -{ - struct aa_perms perms = { - .allow = dfa_user_allow(dfa, state), - .audit = dfa_user_audit(dfa, state), - .quiet = dfa_user_quiet(dfa, state), - }; - - return perms; -} - static const char * const mnt_info_table[] = { "match succeeded", "failed mntpnt match", @@ -236,50 +217,52 @@ static const char * const mnt_info_table[] = { * Returns 0 on success else element that match failed in, this is the * index into the mnt_info_table above */ -static int do_match_mnt(struct aa_dfa *dfa, unsigned int start, +static int do_match_mnt(struct aa_policydb *policy, unsigned int start, const char *mntpnt, const char *devname, const char *type, unsigned long flags, void *data, bool binary, struct aa_perms *perms) { unsigned int state; - AA_BUG(!dfa); + AA_BUG(!policy); + AA_BUG(!policy->dfa); + AA_BUG(!policy->perms); AA_BUG(!perms); - state = aa_dfa_match(dfa, start, mntpnt); - state = aa_dfa_null_transition(dfa, state); + state = aa_dfa_match(policy->dfa, start, mntpnt); + state = aa_dfa_null_transition(policy->dfa, state); if (!state) return 1; if (devname) - state = aa_dfa_match(dfa, state, devname); - state = aa_dfa_null_transition(dfa, state); + state = aa_dfa_match(policy->dfa, state, devname); + state = aa_dfa_null_transition(policy->dfa, state); if (!state) return 2; if (type) - state = aa_dfa_match(dfa, state, type); - state = aa_dfa_null_transition(dfa, state); + state = aa_dfa_match(policy->dfa, state, type); + state = aa_dfa_null_transition(policy->dfa, state); if (!state) return 3; - state = match_mnt_flags(dfa, state, flags); + state = match_mnt_flags(policy->dfa, state, flags); if (!state) return 4; - *perms = compute_mnt_perms(dfa, state); + *perms = *aa_lookup_perms(policy->perms, state); if (perms->allow & AA_MAY_MOUNT) return 0; /* only match data if not binary and the DFA flags data is expected */ if (data && !binary && (perms->allow & AA_MNT_CONT_MATCH)) { - state = aa_dfa_null_transition(dfa, state); + state = aa_dfa_null_transition(policy->dfa, state); if (!state) return 4; - state = aa_dfa_match(dfa, state, data); + state = aa_dfa_match(policy->dfa, state, data); if (!state) return 5; - *perms = compute_mnt_perms(dfa, state); + *perms = *aa_lookup_perms(policy->perms, state); if (perms->allow & AA_MAY_MOUNT) return 0; } @@ -341,7 +324,7 @@ static int match_mnt_path_str(struct aa_profile *profile, } error = -EACCES; - pos = do_match_mnt(profile->policy.dfa, + pos = do_match_mnt(&profile->policy, profile->policy.start[AA_CLASS_MOUNT], mntpnt, devname, type, flags, data, binary, &perms); if (pos) { @@ -601,7 +584,7 @@ static int profile_umount(struct aa_profile *profile, const struct path *path, state = aa_dfa_match(profile->policy.dfa, profile->policy.start[AA_CLASS_MOUNT], name); - perms = compute_mnt_perms(profile->policy.dfa, state); + perms = *aa_lookup_perms(profile->policy.perms, state); if (AA_MAY_UMOUNT & ~perms.allow) error = -EACCES; @@ -672,7 +655,7 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, new_name); state = aa_dfa_null_transition(profile->policy.dfa, state); state = aa_dfa_match(profile->policy.dfa, state, old_name); - perms = compute_mnt_perms(profile->policy.dfa, state); + perms = *aa_lookup_perms(profile->policy.perms, state); if (AA_MAY_PIVOTROOT & perms.allow) error = 0; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index 7efe4d17273d9..88e8a7ea54c0c 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -125,7 +125,7 @@ int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, buffer[1] = cpu_to_be16((u16) type); state = aa_dfa_match_len(profile->policy.dfa, state, (char *) &buffer, 4); - aa_compute_perms(profile->policy.dfa, state, &perms); + perms = *aa_lookup_perms(profile->policy.perms, state); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_net_cb); diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index e2d23cd85cd20..6c3086e2c8201 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -233,7 +233,7 @@ void aa_free_profile(struct aa_profile *profile) aa_put_dfa(profile->xmatch); kvfree(profile->xmatch_perms); aa_put_dfa(profile->policy.dfa); - + kvfree(profile->policy.perms); if (profile->data) { rht = profile->data; profile->data = NULL; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 44910c201c492..ed063385a83b4 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -756,7 +756,7 @@ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; /* DFAs are restricted from having a state_count of less than 2 */ - table = kvzalloc(state_count * 2 * sizeof(struct aa_perms), GFP_KERNEL); + table = kvcalloc(state_count * 2, sizeof(struct aa_perms), GFP_KERNEL); if (!table) return NULL; @@ -789,6 +789,54 @@ static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) return perms_table; } +static u32 map_other(u32 x) +{ + return ((x & 0x3) << 8) | /* SETATTR/GETATTR */ + ((x & 0x1c) << 18) | /* ACCEPT/BIND/LISTEN */ + ((x & 0x60) << 19); /* SETOPT/GETOPT */ +} + +static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms = { }; + + perms.allow = dfa_user_allow(dfa, state); + perms.audit = dfa_user_audit(dfa, state); + perms.quiet = dfa_user_quiet(dfa, state); + + /* for v5 perm mapping in the policydb, the other set is used + * to extend the general perm set + */ + + perms.allow |= map_other(dfa_other_allow(dfa, state)); + perms.audit |= map_other(dfa_other_audit(dfa, state)); + perms.quiet |= map_other(dfa_other_quiet(dfa, state)); + + return perms; +} + +static struct aa_perms *compute_perms(struct aa_dfa *dfa) +{ + int state; + int state_count; + struct aa_perms *table; + + AA_BUG(!dfa); + + state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ + table = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); + if (!table) + return NULL; + + /* zero init so skip the trap state (state == 0) */ + for (state = 1; state < state_count; state++) + table[state] = compute_perms_entry(dfa, state); + + return table; +} + /** * unpack_profile - unpack a serialized profile * @e: serialized data extent information (NOT NULL) @@ -986,6 +1034,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } else profile->policy.dfa = aa_get_dfa(nulldfa); + profile->policy.perms = compute_perms(profile->policy.dfa); + if (!profile->policy.perms) { + info = "failed to remap policydb permission table"; + goto fail; + } /* get file rules */ profile->file.dfa = unpack_dfa(e); -- GitLab From 53bdc46f4bdd20d477afb374767cabe627fd04ae Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 19 Nov 2020 10:37:48 -0800 Subject: [PATCH 0012/1423] apparmor: combine file_rules and aa_policydb into a single shared struct file_rules and policydb are almost the same and will need the same features in the future so combine them. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 3 ++- security/apparmor/domain.c | 7 +++--- security/apparmor/file.c | 20 ++++++++------- security/apparmor/include/file.h | 39 +++--------------------------- security/apparmor/include/policy.h | 14 ++++++++--- security/apparmor/policy.c | 5 ++-- security/apparmor/policy_unpack.c | 11 +++++---- 7 files changed, 40 insertions(+), 59 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 117783779337d..1625fee17fc75 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -619,7 +619,8 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, return; if (profile->file.dfa && *match_str == AA_CLASS_FILE) { dfa = profile->file.dfa; - state = aa_dfa_match_len(dfa, profile->file.start, + state = aa_dfa_match_len(dfa, + profile->file.start[AA_CLASS_FILE], match_str + 1, match_len - 1); if (state) { struct path_cond cond = { }; diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 4fcdcc0de48cc..819b7828cbc44 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -627,7 +627,7 @@ static struct aa_label *profile_transition(struct aa_profile *profile, { struct aa_label *new = NULL; const char *info = NULL, *name = NULL, *target = NULL; - unsigned int state = profile->file.start; + unsigned int state = profile->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; bool nonewprivs = false; int error = 0; @@ -723,7 +723,7 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, char *buffer, struct path_cond *cond, bool *secure_exec) { - unsigned int state = profile->file.start; + unsigned int state = profile->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; const char *xname = NULL, *info = "change_profile onexec"; int error = -EACCES; @@ -1267,7 +1267,8 @@ static int change_profile_perms_wrapper(const char *op, const char *name, if (!error) error = change_profile_perms(profile, target, stack, request, - profile->file.start, perms); + profile->file.start[AA_CLASS_FILE], + perms); if (error) error = aa_audit_file(profile, perms, op, request, name, NULL, target, GLOBAL_ROOT_UID, info, diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 1227ae8391546..d2be851be4121 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -185,16 +185,16 @@ static int path_name(const char *op, struct aa_label *label, * Returns: a pointer to a file permission set */ struct aa_perms default_perms = {}; -struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, +struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, unsigned int state, struct path_cond *cond) { - if (!(file_rules->fperms_table)) + if (!(file_rules->perms)) return &default_perms; if (uid_eq(current_fsuid(), cond->uid)) - return &(file_rules->fperms_table[state * 2]); + return &(file_rules->perms[state * 2]); - return &(file_rules->fperms_table[state * 2 + 1]); + return &(file_rules->perms[state * 2 + 1]); } /** @@ -207,7 +207,7 @@ struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, * * Returns: the final state in @dfa when beginning @start and walking @name */ -unsigned int aa_str_perms(struct aa_file_rules *file_rules, unsigned int start, +unsigned int aa_str_perms(struct aa_policydb *file_rules, unsigned int start, const char *name, struct path_cond *cond, struct aa_perms *perms) { @@ -226,7 +226,8 @@ int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, if (profile_unconfined(profile)) return 0; - aa_str_perms(&(profile->file), profile->file.start, name, cond, perms); + aa_str_perms(&(profile->file), profile->file.start[AA_CLASS_FILE], + name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(profile, perms, op, request, name, NULL, NULL, @@ -333,7 +334,8 @@ static int profile_path_link(struct aa_profile *profile, error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ - state = aa_str_perms(&(profile->file), profile->file.start, lname, + state = aa_str_perms(&(profile->file), + profile->file.start[AA_CLASS_FILE], lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) @@ -363,8 +365,8 @@ static int profile_path_link(struct aa_profile *profile, /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ - aa_str_perms(&(profile->file), profile->file.start, tname, cond, - &perms); + aa_str_perms(&(profile->file), profile->file.start[AA_CLASS_FILE], + tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ request = lperms.allow & ~AA_MAY_LINK; diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 1f9e54aa1adfe..736b8f6554044 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -17,6 +17,7 @@ #include "match.h" #include "perms.h" +struct aa_policydb; struct aa_profile; struct path; @@ -164,29 +165,9 @@ int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); -/** - * struct aa_file_rules - components used for file rule permissions - * @dfa: dfa to match path names and conditionals against - * @perms: permission table indexed by the matched state accept entry of @dfa - * @trans: transition table for indexed by named x transitions - * - * File permission are determined by matching a path against @dfa and - * then using the value of the accept entry for the matching state as - * an index into @perms. If a named exec transition is required it is - * looked up in the transition table. - */ -struct aa_file_rules { - unsigned int start; - struct aa_dfa *dfa; - /* struct perms perms; */ - struct aa_domain trans; - /* TODO: add delegate table */ - struct aa_perms *fperms_table; -}; - -struct aa_perms *aa_lookup_fperms(struct aa_file_rules *file_rules, - unsigned int state, struct path_cond *cond); -unsigned int aa_str_perms(struct aa_file_rules *file_rules, unsigned int start, +struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, + unsigned int state, struct path_cond *cond); +unsigned int aa_str_perms(struct aa_policydb *file_rules, unsigned int start, const char *name, struct path_cond *cond, struct aa_perms *perms); @@ -205,18 +186,6 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, void aa_inherit_files(const struct cred *cred, struct files_struct *files); -static inline void aa_free_fperms_table(struct aa_perms *fperms_table) -{ - if (fperms_table) - kvfree(fperms_table); -} - -static inline void aa_free_file_rules(struct aa_file_rules *rules) -{ - aa_put_dfa(rules->dfa); - aa_free_domain_entries(&rules->trans); - aa_free_fperms_table(rules->fperms_table); -} /** * aa_map_file_perms - map file flags to AppArmor permissions diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 0dec18cd95e56..9bafeb3847d59 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -75,13 +75,21 @@ enum profile_mode { * start: set of start states for the different classes of data */ struct aa_policydb { - /* Generic policy DFA specific rule types will be subsections of it */ struct aa_dfa *dfa; struct aa_perms *perms; + struct aa_domain trans; unsigned int start[AA_CLASS_LAST + 1]; - }; +static inline void aa_destroy_policydb(struct aa_policydb *policy) +{ + aa_put_dfa(policy->dfa); + if (policy->perms) + kvfree(policy->perms); + aa_free_domain_entries(&policy->trans); + +} + /* struct aa_data - generic data structure * key: name for retrieving this data * size: size of data in bytes @@ -151,7 +159,7 @@ struct aa_profile { int size; struct aa_policydb policy; - struct aa_file_rules file; + struct aa_policydb file; struct aa_caps caps; int xattr_count; diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 6c3086e2c8201..0814ee57a06bf 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -219,7 +219,7 @@ void aa_free_profile(struct aa_profile *profile) aa_put_ns(profile->ns); kfree_sensitive(profile->rename); - aa_free_file_rules(&profile->file); + aa_destroy_policydb(&profile->file); aa_free_cap_rules(&profile->caps); aa_free_rlimit_rules(&profile->rlimits); @@ -232,8 +232,7 @@ void aa_free_profile(struct aa_profile *profile) kfree_sensitive(profile->dirname); aa_put_dfa(profile->xmatch); kvfree(profile->xmatch_perms); - aa_put_dfa(profile->policy.dfa); - kvfree(profile->policy.perms); + aa_destroy_policydb(&profile->policy); if (profile->data) { rht = profile->data; profile->data = NULL; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index ed063385a83b4..726fa02026b57 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1048,18 +1048,19 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "failed to unpack profile file rules"; goto fail; } else if (profile->file.dfa) { - if (!unpack_u32(e, &profile->file.start, "dfa_start")) + if (!unpack_u32(e, &profile->file.start[AA_CLASS_FILE], + "dfa_start")) /* default start state */ - profile->file.start = DFA_START; + profile->file.start[AA_CLASS_FILE] = DFA_START; } else if (profile->policy.dfa && profile->policy.start[AA_CLASS_FILE]) { profile->file.dfa = aa_get_dfa(profile->policy.dfa); - profile->file.start = profile->policy.start[AA_CLASS_FILE]; + profile->file.start[AA_CLASS_FILE] = profile->policy.start[AA_CLASS_FILE]; } else profile->file.dfa = aa_get_dfa(nulldfa); - profile->file.fperms_table = compute_fperms(profile->file.dfa); - if (!profile->file.fperms_table) { + profile->file.perms = compute_fperms(profile->file.dfa); + if (!profile->file.perms) { info = "failed to remap file permission table"; goto fail; } -- GitLab From 048d49544455b3e3a535c4ec89057ea5ca8676f0 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 21 Nov 2020 01:42:40 -0800 Subject: [PATCH 0013/1423] apparmor: convert xmatch to using the new shared policydb struct continue permission unification by converting xmatch to use the policydb struct that is used by the other profile dfas. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/domain.c | 22 ++++++++++++---------- security/apparmor/include/apparmor.h | 1 + security/apparmor/include/policy.h | 4 +--- security/apparmor/policy.c | 3 +-- security/apparmor/policy_unpack.c | 25 ++++++++++++------------- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 1625fee17fc75..a2d12b80592bd 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1095,7 +1095,7 @@ static int seq_profile_attach_show(struct seq_file *seq, void *v) struct aa_profile *profile = labels_profile(label); if (profile->attach) seq_printf(seq, "%s\n", profile->attach); - else if (profile->xmatch) + else if (profile->xmatch.dfa) seq_puts(seq, "\n"); else seq_printf(seq, "%s\n", profile->base.name); diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 819b7828cbc44..0df17fb236c78 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -321,7 +321,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, might_sleep(); /* transition from exec match to xattr set */ - state = aa_dfa_outofband_transition(profile->xmatch, state); + state = aa_dfa_outofband_transition(profile->xmatch.dfa, state); d = bprm->file->f_path.dentry; for (i = 0; i < profile->xattr_count; i++) { @@ -335,18 +335,19 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, * that not present xattr can be distinguished from a 0 * length value or rule that matches any value */ - state = aa_dfa_null_transition(profile->xmatch, state); + state = aa_dfa_null_transition(profile->xmatch.dfa, + state); /* Check xattr value */ - state = aa_dfa_match_len(profile->xmatch, state, value, - size); - perm = profile->xmatch_perms[state].allow; + state = aa_dfa_match_len(profile->xmatch.dfa, state, + value, size); + perm = profile->xmatch.perms[state].allow; if (!(perm & MAY_EXEC)) { ret = -EINVAL; goto out; } } /* transition to next element */ - state = aa_dfa_outofband_transition(profile->xmatch, state); + state = aa_dfa_outofband_transition(profile->xmatch.dfa, state); if (size < 0) { /* * No xattr match, so verify if transition to @@ -413,13 +414,14 @@ restart: * as another profile, signal a conflict and refuse to * match. */ - if (profile->xmatch) { + if (profile->xmatch.dfa) { unsigned int state, count; u32 perm; - state = aa_dfa_leftmatch(profile->xmatch, DFA_START, - name, &count); - perm = profile->xmatch_perms[state].allow; + state = aa_dfa_leftmatch(profile->xmatch.dfa, + profile->xmatch.start[AA_CLASS_XMATCH], + name, &count); + perm = profile->xmatch.perms[state].allow; /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { int ret = 0; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index dd2c131ed1706..8fd66a4ca0b86 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -26,6 +26,7 @@ #define AA_CLASS_MOUNT 7 #define AA_CLASS_PTRACE 9 #define AA_CLASS_SIGNAL 10 +#define AA_CLASS_XMATCH 11 #define AA_CLASS_NET 14 #define AA_CLASS_LABEL 16 #define AA_CLASS_POSIX_MQUEUE 17 diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 9bafeb3847d59..44d8cbb1c3685 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -113,7 +113,6 @@ struct aa_data { * @attach: human readable attachment string * @xmatch: optional extended matching for unconfined executables names * @xmatch_len: xmatch prefix len, used to determine xmatch priority - * @xmatch_perms: precomputed permissions for the xmatch DFA indexed by state * @audit: the auditing mode of the profile * @mode: the enforcement mode of the profile * @path_flags: flags controlling path generation behavior @@ -148,9 +147,8 @@ struct aa_profile { const char *rename; const char *attach; - struct aa_dfa *xmatch; + struct aa_policydb xmatch; unsigned int xmatch_len; - struct aa_perms *xmatch_perms; enum audit_mode audit; long mode; diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 0814ee57a06bf..cdcf26c9bed57 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -230,8 +230,7 @@ void aa_free_profile(struct aa_profile *profile) kfree_sensitive(profile->secmark[i].label); kfree_sensitive(profile->secmark); kfree_sensitive(profile->dirname); - aa_put_dfa(profile->xmatch); - kvfree(profile->xmatch_perms); + aa_destroy_policydb(&profile->xmatch); aa_destroy_policydb(&profile->policy); if (profile->data) { rht = profile->data; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 726fa02026b57..f2a075986e49f 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -771,7 +771,7 @@ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) { - struct aa_perms *perms_table; + struct aa_perms *perms; int state; int state_count; @@ -779,14 +779,13 @@ static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; /* DFAs are restricted from having a state_count of less than 2 */ - perms_table = kvcalloc(state_count, sizeof(struct aa_perms), - GFP_KERNEL); + perms = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); /* zero init so skip the trap state (state == 0) */ for (state = 1; state < state_count; state++) - perms_table[state].allow = dfa_user_allow(xmatch, state); + perms[state].allow = dfa_user_allow(xmatch, state); - return perms_table; + return perms; } static u32 map_other(u32 x) @@ -888,23 +887,23 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) (void) unpack_str(e, &profile->attach, "attach"); /* xmatch is optional and may be NULL */ - profile->xmatch = unpack_dfa(e); - if (IS_ERR(profile->xmatch)) { - error = PTR_ERR(profile->xmatch); - profile->xmatch = NULL; + profile->xmatch.dfa = unpack_dfa(e); + if (IS_ERR(profile->xmatch.dfa)) { + error = PTR_ERR(profile->xmatch.dfa); + profile->xmatch.dfa = NULL; info = "bad xmatch"; goto fail; } /* neither xmatch_len not xmatch_perms are optional if xmatch is set */ - if (profile->xmatch) { + if (profile->xmatch.dfa) { if (!unpack_u32(e, &tmp, NULL)) { info = "missing xmatch len"; goto fail; } profile->xmatch_len = tmp; - - profile->xmatch_perms = compute_xmatch_perms(profile->xmatch); - if (!profile->xmatch_perms) { + profile->xmatch.start[AA_CLASS_XMATCH] = DFA_START; + profile->xmatch.perms = compute_xmatch_perms(profile->xmatch.dfa); + if (!profile->xmatch.perms) { info = "failed to convert xmatch permission table"; goto fail; } -- GitLab From 7572fea31e3e5c4c19154ccc064eb1f83dfe1333 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 13 Nov 2020 01:46:23 -0800 Subject: [PATCH 0014/1423] apparmor: convert fperm lookup to use accept as an index Remap file dfa accept table from embedded perms to index and then move fperm lookup to use the accept entry as an index into the fperm table. This is a step toward unifying permission lookup. Signed-off-by: John Johansen --- security/apparmor/file.c | 6 ++-- security/apparmor/policy_unpack.c | 57 ++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index d2be851be4121..7bddec3df75f7 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -188,13 +188,15 @@ struct aa_perms default_perms = {}; struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, unsigned int state, struct path_cond *cond) { + unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state]; + if (!(file_rules->perms)) return &default_perms; if (uid_eq(current_fsuid(), cond->uid)) - return &(file_rules->perms[state * 2]); + return &(file_rules->perms[index]); - return &(file_rules->perms[state * 2 + 1]); + return &(file_rules->perms[index + 1]); } /** diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index f2a075986e49f..4cf62c1be388f 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -836,6 +836,29 @@ static struct aa_perms *compute_perms(struct aa_dfa *dfa) return table; } +/** + * remap_dfa_accept - remap old dfa accept table to be an index + * @dfa: dfa to do the remapping on + * @factor: scaling factor for the index conversion. + * + * Used in conjunction with compute_Xperms, it converts old style perms + * that are encoded in the dfa accept tables to the new style where + * there is a permission table and the accept table is an index into + * the permission table. + */ +static void remap_dfa_accept(struct aa_dfa *dfa, unsigned int factor) +{ + unsigned int state; + unsigned int state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + + AA_BUG(!dfa); + + for (state = 0; state < state_count; state++) + ACCEPT_TABLE(dfa)[state] = state * factor; + kvfree(dfa->tables[YYTD_ID_ACCEPT2]); + dfa->tables[YYTD_ID_ACCEPT2] = NULL; +} + /** * unpack_profile - unpack a serialized profile * @e: serialized data extent information (NOT NULL) @@ -1051,6 +1074,16 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) "dfa_start")) /* default start state */ profile->file.start[AA_CLASS_FILE] = DFA_START; + profile->file.perms = compute_fperms(profile->file.dfa); + if (!profile->file.perms) { + info = "failed to remap file permission table"; + goto fail; + } + remap_dfa_accept(profile->file.dfa, 2); + if (!unpack_trans_table(e, profile)) { + info = "failed to unpack profile transition table"; + goto fail; + } } else if (profile->policy.dfa && profile->policy.start[AA_CLASS_FILE]) { profile->file.dfa = aa_get_dfa(profile->policy.dfa); @@ -1058,16 +1091,6 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else profile->file.dfa = aa_get_dfa(nulldfa); - profile->file.perms = compute_fperms(profile->file.dfa); - if (!profile->file.perms) { - info = "failed to remap file permission table"; - goto fail; - } - if (!unpack_trans_table(e, profile)) { - info = "failed to unpack profile transition table"; - goto fail; - } - if (unpack_nameX(e, AA_STRUCT, "data")) { info = "out of memory"; profile->data = kzalloc(sizeof(*profile->data), GFP_KERNEL); @@ -1198,9 +1221,7 @@ static bool verify_dfa_xindex(struct aa_dfa *dfa, int table_size) { int i; for (i = 0; i < dfa->tables[YYTD_ID_ACCEPT]->td_lolen; i++) { - if (!verify_xindex(dfa_user_xindex(dfa, i), table_size)) - return false; - if (!verify_xindex(dfa_other_xindex(dfa, i), table_size)) + if (!verify_xindex(ACCEPT_TABLE(dfa)[i], table_size)) return false; } return true; @@ -1211,14 +1232,16 @@ static bool verify_dfa_xindex(struct aa_dfa *dfa, int table_size) * @profile: profile to verify (NOT NULL) * * Returns: 0 if passes verification else error + * + * This verification is post any unpack mapping or changes */ static int verify_profile(struct aa_profile *profile) { if (profile->file.dfa && - !verify_dfa_xindex(profile->file.dfa, - profile->file.trans.size)) { - audit_iface(profile, NULL, NULL, "Invalid named transition", - NULL, -EPROTO); + !verify_dfa_xindex(profile->file.dfa, + profile->file.trans.size)) { + audit_iface(profile, NULL, NULL, + "Unpack: Invalid named transition", NULL, -EPROTO); return -EPROTO; } -- GitLab From 2d63dd43ae334ec6f5374d37bb06c4cc57621b3c Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 13 Nov 2020 23:36:09 -0800 Subject: [PATCH 0015/1423] apparmor: convert xmatch lookup to use accept as an index Remap xmatch dfa accept table from embedded perms to an index and then move xmatch lookup to use accept entry to index into the xmatch table. This is step towards unifying permission lookup and reducing the size of permissions tables. Signed-off-by: John Johansen --- security/apparmor/domain.c | 10 ++++++---- security/apparmor/policy_unpack.c | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 0df17fb236c78..45a8887021f10 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -328,7 +328,7 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, size = vfs_getxattr_alloc(&init_user_ns, d, profile->xattrs[i], &value, value_size, GFP_KERNEL); if (size >= 0) { - u32 perm; + u32 index, perm; /* * Check the xattr presence before value. This ensure @@ -340,7 +340,8 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, /* Check xattr value */ state = aa_dfa_match_len(profile->xmatch.dfa, state, value, size); - perm = profile->xmatch.perms[state].allow; + index = ACCEPT_TABLE(profile->xmatch.dfa)[state]; + perm = profile->xmatch.perms[index].allow; if (!(perm & MAY_EXEC)) { ret = -EINVAL; goto out; @@ -416,12 +417,13 @@ restart: */ if (profile->xmatch.dfa) { unsigned int state, count; - u32 perm; + u32 index, perm; state = aa_dfa_leftmatch(profile->xmatch.dfa, profile->xmatch.start[AA_CLASS_XMATCH], name, &count); - perm = profile->xmatch.perms[state].allow; + index = ACCEPT_TABLE(profile->xmatch.dfa)[state]; + perm = profile->xmatch.perms[index].allow; /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { int ret = 0; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 4cf62c1be388f..4cdc969887832 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -930,6 +930,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "failed to convert xmatch permission table"; goto fail; } + remap_dfa_accept(profile->xmatch.dfa, 1); } /* disconnected attachment string is optional */ -- GitLab From bf690f59d0429c62de4db1234f16557eedcb39bf Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 10 Apr 2021 02:09:44 -0700 Subject: [PATCH 0016/1423] apparmor: cleanup shared permission struct The shared permissions struct has the stop field which is unneeded and the "reserved" subtree field commented which is needed. Also reorganize so that the entries are logically grouped. Signed-off-by: John Johansen --- security/apparmor/include/perms.h | 17 +++++++---------- security/apparmor/lib.c | 4 ++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index de9631edb1ff9..1f3e7680e8092 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -65,22 +65,19 @@ extern const char *aa_file_perm_names[]; struct aa_perms { u32 allow; - u32 audit; /* set only when allow is set */ - u32 deny; /* explicit deny, or conflict if allow also set */ - u32 quiet; /* set only when ~allow | deny */ - u32 kill; /* set only when ~allow | deny */ - u32 stop; /* set only when ~allow | deny */ - u32 complain; /* accumulates only used when ~allow & ~deny */ + u32 subtree; /* allow perm on full subtree only when allow is set */ u32 cond; /* set only when ~allow and ~deny */ - u32 hide; /* set only when ~allow | deny */ + u32 kill; /* set only when ~allow | deny */ + u32 complain; /* accumulates only used when ~allow & ~deny */ u32 prompt; /* accumulates only used when ~allow & ~deny */ - /* Reserved: - * u32 subtree; / * set only when allow is set * / - */ + u32 audit; /* set only when allow is set */ + u32 quiet; /* set only when ~allow | deny */ + u32 hide; /* set only when ~allow | deny */ + u16 xindex; }; diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 505ef5848f7ce..974a217218a6e 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -327,11 +327,11 @@ void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend) accum->audit |= addend->audit & addend->allow; accum->quiet &= addend->quiet & ~addend->allow; accum->kill |= addend->kill & ~addend->allow; - accum->stop |= addend->stop & ~addend->allow; accum->complain |= addend->complain & ~addend->allow & ~addend->deny; accum->cond |= addend->cond & ~addend->allow & ~addend->deny; accum->hide &= addend->hide & ~addend->allow; accum->prompt |= addend->prompt & ~addend->allow & ~addend->deny; + accum->subtree |= addend->subtree & ~addend->deny; } /** @@ -346,11 +346,11 @@ void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend) accum->audit |= addend->audit & accum->allow; accum->quiet &= addend->quiet & ~accum->allow; accum->kill |= addend->kill & ~accum->allow; - accum->stop |= addend->stop & ~accum->allow; accum->complain |= addend->complain & ~accum->allow & ~accum->deny; accum->cond |= addend->cond & ~accum->allow & ~accum->deny; accum->hide &= addend->hide & ~accum->allow; accum->prompt |= addend->prompt & ~accum->allow & ~accum->deny; + accum->subtree &= addend->subtree & ~accum->deny; } void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, -- GitLab From e844fe9b51c984472ea98be3b2d1201ba9ee3213 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 16 Jul 2022 01:53:46 -0700 Subject: [PATCH 0017/1423] apparmor: convert policy lookup to use accept as an index Remap polidydb dfa accept table from embedded perms to an index, and then move the perm lookup to use the accept entry as an index into the perm table. This is done so that the perm table can be separated from the dfa, allowing dfa accept to index to share expanded permission sets. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/perms.h | 8 -------- security/apparmor/include/policy.h | 12 ++++++++++++ security/apparmor/label.c | 6 +++--- security/apparmor/mount.c | 8 ++++---- security/apparmor/net.c | 2 +- security/apparmor/policy_unpack.c | 19 ++++++++++++------- 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index a2d12b80592bd..f2b78108bae81 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -634,7 +634,7 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, state = aa_dfa_match_len(dfa, profile->policy.start[0], match_str, match_len); if (state) - tmp = *aa_lookup_perms(profile->policy.perms, state); + tmp = *aa_lookup_perms(&profile->policy, state); } aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum_raw(perms, &tmp); diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 1f3e7680e8092..1014a7bbc0278 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -132,14 +132,6 @@ extern struct aa_perms allperms; extern struct aa_perms default_perms; -static inline struct aa_perms *aa_lookup_perms(struct aa_perms *perms, - unsigned int state) -{ - if (!(perms)) - return &default_perms; - - return &(perms[state]); -} void aa_perm_mask_to_str(char *str, size_t str_size, const char *chrs, u32 mask); diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 44d8cbb1c3685..31c0af8762505 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -90,6 +90,18 @@ static inline void aa_destroy_policydb(struct aa_policydb *policy) } +static inline struct aa_perms *aa_lookup_perms(struct aa_policydb *policy, + unsigned int state) +{ + unsigned int index = ACCEPT_TABLE(policy->dfa)[state]; + + if (!(policy->perms)) + return &default_perms; + + return &(policy->perms[index]); +} + + /* struct aa_data - generic data structure * key: name for retrieving this data * size: size of data in bytes diff --git a/security/apparmor/label.c b/security/apparmor/label.c index ddb04417bdabf..30cb68641c0f5 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1328,7 +1328,7 @@ next: if (!state) goto fail; } - *perms = *aa_lookup_perms(profile->policy.perms, state); + *perms = *aa_lookup_perms(&profile->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -1379,7 +1379,7 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - tmp = *aa_lookup_perms(profile->policy.perms, state); + tmp = *aa_lookup_perms(&profile->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { @@ -1388,7 +1388,7 @@ next: state = match_component(profile, tp, start); if (!state) goto fail; - tmp = *aa_lookup_perms(profile->policy.perms, state); + tmp = *aa_lookup_perms(&profile->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 1e978c2b1ee45..7594f3a3441e5 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -249,7 +249,7 @@ static int do_match_mnt(struct aa_policydb *policy, unsigned int start, state = match_mnt_flags(policy->dfa, state, flags); if (!state) return 4; - *perms = *aa_lookup_perms(policy->perms, state); + *perms = *aa_lookup_perms(policy, state); if (perms->allow & AA_MAY_MOUNT) return 0; @@ -262,7 +262,7 @@ static int do_match_mnt(struct aa_policydb *policy, unsigned int start, state = aa_dfa_match(policy->dfa, state, data); if (!state) return 5; - *perms = *aa_lookup_perms(policy->perms, state); + *perms = *aa_lookup_perms(policy, state); if (perms->allow & AA_MAY_MOUNT) return 0; } @@ -584,7 +584,7 @@ static int profile_umount(struct aa_profile *profile, const struct path *path, state = aa_dfa_match(profile->policy.dfa, profile->policy.start[AA_CLASS_MOUNT], name); - perms = *aa_lookup_perms(profile->policy.perms, state); + perms = *aa_lookup_perms(&profile->policy, state); if (AA_MAY_UMOUNT & ~perms.allow) error = -EACCES; @@ -655,7 +655,7 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, new_name); state = aa_dfa_null_transition(profile->policy.dfa, state); state = aa_dfa_match(profile->policy.dfa, state, old_name); - perms = *aa_lookup_perms(profile->policy.perms, state); + perms = *aa_lookup_perms(&profile->policy, state); if (AA_MAY_PIVOTROOT & perms.allow) error = 0; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index 88e8a7ea54c0c..fcfb97079e1be 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -125,7 +125,7 @@ int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, buffer[1] = cpu_to_be16((u16) type); state = aa_dfa_match_len(profile->policy.dfa, state, (char *) &buffer, 4); - perms = *aa_lookup_perms(profile->policy.perms, state); + perms = *aa_lookup_perms(&profile->policy, state); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_net_cb); diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 4cdc969887832..0917412ba48f4 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1055,13 +1055,15 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; + profile->policy.perms = compute_perms(profile->policy.dfa); + if (!profile->policy.perms) { + info = "failed to remap policydb permission table"; + goto fail; + } + /* Do not remap internal dfas */ + remap_dfa_accept(profile->policy.dfa, 1); } else profile->policy.dfa = aa_get_dfa(nulldfa); - profile->policy.perms = compute_perms(profile->policy.dfa); - if (!profile->policy.perms) { - info = "failed to remap policydb permission table"; - goto fail; - } /* get file rules */ profile->file.dfa = unpack_dfa(e); @@ -1238,9 +1240,12 @@ static bool verify_dfa_xindex(struct aa_dfa *dfa, int table_size) */ static int verify_profile(struct aa_profile *profile) { - if (profile->file.dfa && + if ((profile->file.dfa && !verify_dfa_xindex(profile->file.dfa, - profile->file.trans.size)) { + profile->file.trans.size)) || + (profile->policy.dfa && + !verify_dfa_xindex(profile->policy.dfa, + profile->policy.trans.size))) { audit_iface(profile, NULL, NULL, "Unpack: Invalid named transition", NULL, -EPROTO); return -EPROTO; -- GitLab From 33fc95d8293cfca352ac875668857293e22d7d51 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 17 Jan 2022 13:43:49 -0800 Subject: [PATCH 0018/1423] apparmor: preparse for state being more than just an integer Convert from an unsigned int to a state_t for state position. This is a step in prepping for the state position carrying some additional flags, and a limited form of backtracking to support variables. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/domain.c | 25 ++++++------- security/apparmor/file.c | 12 +++---- security/apparmor/include/file.h | 8 ++--- security/apparmor/include/label.h | 6 ++-- security/apparmor/include/lib.h | 4 +-- security/apparmor/include/match.h | 28 +++++++-------- security/apparmor/include/policy.h | 14 ++++---- security/apparmor/ipc.c | 2 +- security/apparmor/label.c | 14 ++++---- security/apparmor/lib.c | 2 +- security/apparmor/match.c | 58 +++++++++++++++--------------- security/apparmor/mount.c | 10 +++--- security/apparmor/net.c | 2 +- security/apparmor/policy_unpack.c | 16 ++++----- 15 files changed, 101 insertions(+), 102 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index f2b78108bae81..fb9d2ccb34d60 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -613,7 +613,7 @@ static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, { struct aa_perms tmp = { }; struct aa_dfa *dfa; - unsigned int state = 0; + aa_state_t state = DFA_NOMATCH; if (profile_unconfined(profile)) return; diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 45a8887021f10..5883f0fc02d3d 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -95,9 +95,9 @@ out: * If a subns profile is not to be matched should be prescreened with * visibility test. */ -static inline unsigned int match_component(struct aa_profile *profile, - struct aa_profile *tp, - bool stack, unsigned int state) +static inline aa_state_t match_component(struct aa_profile *profile, + struct aa_profile *tp, + bool stack, aa_state_t state) { const char *ns_name; @@ -132,7 +132,7 @@ static inline unsigned int match_component(struct aa_profile *profile, */ static int label_compound_match(struct aa_profile *profile, struct aa_label *label, bool stack, - unsigned int state, bool subns, u32 request, + aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; @@ -192,14 +192,14 @@ fail: */ static int label_components_match(struct aa_profile *profile, struct aa_label *label, bool stack, - unsigned int start, bool subns, u32 request, + aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; struct path_cond cond = { }; - unsigned int state = 0; + aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { @@ -252,7 +252,7 @@ fail: * Returns: the state the match finished in, may be the none matching state */ static int label_match(struct aa_profile *profile, struct aa_label *label, - bool stack, unsigned int state, bool subns, u32 request, + bool stack, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error; @@ -286,7 +286,7 @@ static int label_match(struct aa_profile *profile, struct aa_label *label, */ static int change_profile_perms(struct aa_profile *profile, struct aa_label *target, bool stack, - u32 request, unsigned int start, + u32 request, aa_state_t start, struct aa_perms *perms) { if (profile_unconfined(profile)) { @@ -308,7 +308,7 @@ static int change_profile_perms(struct aa_profile *profile, * Returns: number of extended attributes that matched, or < 0 on error */ static int aa_xattrs_match(const struct linux_binprm *bprm, - struct aa_profile *profile, unsigned int state) + struct aa_profile *profile, aa_state_t state) { int i; ssize_t size; @@ -416,7 +416,8 @@ restart: * match. */ if (profile->xmatch.dfa) { - unsigned int state, count; + unsigned int count; + aa_state_t state; u32 index, perm; state = aa_dfa_leftmatch(profile->xmatch.dfa, @@ -631,7 +632,7 @@ static struct aa_label *profile_transition(struct aa_profile *profile, { struct aa_label *new = NULL; const char *info = NULL, *name = NULL, *target = NULL; - unsigned int state = profile->file.start[AA_CLASS_FILE]; + aa_state_t state = profile->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; bool nonewprivs = false; int error = 0; @@ -727,7 +728,7 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, char *buffer, struct path_cond *cond, bool *secure_exec) { - unsigned int state = profile->file.start[AA_CLASS_FILE]; + aa_state_t state = profile->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; const char *xname = NULL, *info = "change_profile onexec"; int error = -EACCES; diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 7bddec3df75f7..636efcade3f58 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -186,7 +186,7 @@ static int path_name(const char *op, struct aa_label *label, */ struct aa_perms default_perms = {}; struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, - unsigned int state, struct path_cond *cond) + aa_state_t state, struct path_cond *cond) { unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state]; @@ -209,11 +209,11 @@ struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, * * Returns: the final state in @dfa when beginning @start and walking @name */ -unsigned int aa_str_perms(struct aa_policydb *file_rules, unsigned int start, - const char *name, struct path_cond *cond, - struct aa_perms *perms) +aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, + const char *name, struct path_cond *cond, + struct aa_perms *perms) { - unsigned int state; + aa_state_t state; state = aa_dfa_match(file_rules->dfa, start, name); *perms = *(aa_lookup_fperms(file_rules, state, cond)); @@ -320,7 +320,7 @@ static int profile_path_link(struct aa_profile *profile, struct aa_perms lperms = {}, perms; const char *info = NULL; u32 request = AA_MAY_LINK; - unsigned int state; + aa_state_t state; int error; error = path_name(OP_LINK, &profile->label, link, profile->path_flags, diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 736b8f6554044..8c82cf279dc2a 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -166,10 +166,10 @@ int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, const char *info, int error); struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, - unsigned int state, struct path_cond *cond); -unsigned int aa_str_perms(struct aa_policydb *file_rules, unsigned int start, - const char *name, struct path_cond *cond, - struct aa_perms *perms); + aa_state_t state, struct path_cond *cond); +aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, + const char *name, struct path_cond *cond, + struct aa_perms *perms); int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, diff --git a/security/apparmor/include/label.h b/security/apparmor/include/label.h index 860484c6f99a6..1130ba10a1526 100644 --- a/security/apparmor/include/label.h +++ b/security/apparmor/include/label.h @@ -333,7 +333,7 @@ struct aa_label *aa_label_parse(struct aa_label *base, const char *str, static inline const char *aa_label_strn_split(const char *str, int n) { const char *pos; - unsigned int state; + aa_state_t state; state = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) @@ -345,7 +345,7 @@ static inline const char *aa_label_strn_split(const char *str, int n) static inline const char *aa_label_str_split(const char *str) { const char *pos; - unsigned int state; + aa_state_t state; state = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) @@ -358,7 +358,7 @@ static inline const char *aa_label_str_split(const char *str) struct aa_perms; int aa_label_match(struct aa_profile *profile, struct aa_label *label, - unsigned int state, bool subns, u32 request, + aa_state_t state, bool subns, u32 request, struct aa_perms *perms); diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index f42359f58eb58..f176f3ced2a36 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -87,8 +87,8 @@ static inline bool aa_strneq(const char *str, const char *sub, int len) * character which is not used in standard matching and is only * used to separate pairs. */ -static inline unsigned int aa_dfa_null_transition(struct aa_dfa *dfa, - unsigned int start) +static inline aa_state_t aa_dfa_null_transition(struct aa_dfa *dfa, + aa_state_t start) { /* the null transition only needs the string's null terminator byte */ return aa_dfa_next(dfa, start, 0); diff --git a/security/apparmor/include/match.h b/security/apparmor/include/match.h index 8844895905881..58fbf67139b9c 100644 --- a/security/apparmor/include/match.h +++ b/security/apparmor/include/match.h @@ -125,19 +125,19 @@ static inline size_t table_size(size_t len, size_t el_size) int aa_setup_dfa_engine(void); void aa_teardown_dfa_engine(void); +#define aa_state_t unsigned int + struct aa_dfa *aa_dfa_unpack(void *blob, size_t size, int flags); -unsigned int aa_dfa_match_len(struct aa_dfa *dfa, unsigned int start, - const char *str, int len); -unsigned int aa_dfa_match(struct aa_dfa *dfa, unsigned int start, - const char *str); -unsigned int aa_dfa_next(struct aa_dfa *dfa, unsigned int state, - const char c); -unsigned int aa_dfa_outofband_transition(struct aa_dfa *dfa, - unsigned int state); -unsigned int aa_dfa_match_until(struct aa_dfa *dfa, unsigned int start, - const char *str, const char **retpos); -unsigned int aa_dfa_matchn_until(struct aa_dfa *dfa, unsigned int start, - const char *str, int n, const char **retpos); +aa_state_t aa_dfa_match_len(struct aa_dfa *dfa, aa_state_t start, + const char *str, int len); +aa_state_t aa_dfa_match(struct aa_dfa *dfa, aa_state_t start, + const char *str); +aa_state_t aa_dfa_next(struct aa_dfa *dfa, aa_state_t state, const char c); +aa_state_t aa_dfa_outofband_transition(struct aa_dfa *dfa, aa_state_t state); +aa_state_t aa_dfa_match_until(struct aa_dfa *dfa, aa_state_t start, + const char *str, const char **retpos); +aa_state_t aa_dfa_matchn_until(struct aa_dfa *dfa, aa_state_t start, + const char *str, int n, const char **retpos); void aa_dfa_free_kref(struct kref *kref); @@ -156,8 +156,8 @@ struct match_workbuf N = { \ .len = 0, \ } -unsigned int aa_dfa_leftmatch(struct aa_dfa *dfa, unsigned int start, - const char *str, unsigned int *count); +aa_state_t aa_dfa_leftmatch(struct aa_dfa *dfa, aa_state_t start, + const char *str, unsigned int *count); /** * aa_get_dfa - increment refcount on dfa @p diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 31c0af8762505..3a7d165e8fcc1 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -78,7 +78,7 @@ struct aa_policydb { struct aa_dfa *dfa; struct aa_perms *perms; struct aa_domain trans; - unsigned int start[AA_CLASS_LAST + 1]; + aa_state_t start[AA_CLASS_LAST + 1]; }; static inline void aa_destroy_policydb(struct aa_policydb *policy) @@ -91,7 +91,7 @@ static inline void aa_destroy_policydb(struct aa_policydb *policy) } static inline struct aa_perms *aa_lookup_perms(struct aa_policydb *policy, - unsigned int state) + aa_state_t state) { unsigned int index = ACCEPT_TABLE(policy->dfa)[state]; @@ -239,7 +239,7 @@ static inline struct aa_profile *aa_get_newest_profile(struct aa_profile *p) return labels_profile(aa_get_newest_label(&p->label)); } -static inline unsigned int PROFILE_MEDIATES(struct aa_profile *profile, +static inline aa_state_t PROFILE_MEDIATES(struct aa_profile *profile, unsigned char class) { if (class <= AA_CLASS_LAST) @@ -249,13 +249,13 @@ static inline unsigned int PROFILE_MEDIATES(struct aa_profile *profile, profile->policy.start[0], &class, 1); } -static inline unsigned int PROFILE_MEDIATES_AF(struct aa_profile *profile, - u16 AF) { - unsigned int state = PROFILE_MEDIATES(profile, AA_CLASS_NET); +static inline aa_state_t PROFILE_MEDIATES_AF(struct aa_profile *profile, + u16 AF) { + aa_state_t state = PROFILE_MEDIATES(profile, AA_CLASS_NET); __be16 be_af = cpu_to_be16(AF); if (!state) - return 0; + return DFA_NOMATCH; return aa_dfa_match_len(profile->policy.dfa, state, (char *) &be_af, 2); } diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 3dbbc59d440da..7255a9d523728 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -79,7 +79,7 @@ static int profile_signal_perm(struct aa_profile *profile, struct common_audit_data *sa) { struct aa_perms perms; - unsigned int state; + aa_state_t state; if (profile_unconfined(profile) || !PROFILE_MEDIATES(profile, AA_CLASS_SIGNAL)) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 30cb68641c0f5..3a967003fa7c7 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1265,9 +1265,9 @@ static inline bool label_is_visible(struct aa_profile *profile, * If a subns profile is not to be matched should be prescreened with * visibility test. */ -static inline unsigned int match_component(struct aa_profile *profile, - struct aa_profile *tp, - unsigned int state) +static inline aa_state_t match_component(struct aa_profile *profile, + struct aa_profile *tp, + aa_state_t state) { const char *ns_name; @@ -1299,7 +1299,7 @@ static inline unsigned int match_component(struct aa_profile *profile, */ static int label_compound_match(struct aa_profile *profile, struct aa_label *label, - unsigned int state, bool subns, u32 request, + aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; @@ -1356,14 +1356,14 @@ fail: * check to be stacked. */ static int label_components_match(struct aa_profile *profile, - struct aa_label *label, unsigned int start, + struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; - unsigned int state = 0; + aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { @@ -1415,7 +1415,7 @@ fail: * Returns: the state the match finished in, may be the none matching state */ int aa_label_match(struct aa_profile *profile, struct aa_label *label, - unsigned int state, bool subns, u32 request, + aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error = label_compound_match(profile, label, state, subns, request, diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 974a217218a6e..60deb4dc30c7f 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -357,7 +357,7 @@ void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, int type, u32 request, struct aa_perms *perms) { /* TODO: doesn't yet handle extended types */ - unsigned int state; + aa_state_t state; state = aa_dfa_next(profile->policy.dfa, profile->policy.start[AA_CLASS_LABEL], diff --git a/security/apparmor/match.c b/security/apparmor/match.c index 3e9e1eaf990ed..5095c26ca6836 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -436,17 +436,17 @@ do { \ * * Returns: final state reached after input is consumed */ -unsigned int aa_dfa_match_len(struct aa_dfa *dfa, unsigned int start, - const char *str, int len) +aa_state_t aa_dfa_match_len(struct aa_dfa *dfa, aa_state_t start, + const char *str, int len) { u16 *def = DEFAULT_TABLE(dfa); u32 *base = BASE_TABLE(dfa); u16 *next = NEXT_TABLE(dfa); u16 *check = CHECK_TABLE(dfa); - unsigned int state = start; + aa_state_t state = start; - if (state == 0) - return 0; + if (state == DFA_NOMATCH) + return DFA_NOMATCH; /* current state is , matching character *str */ if (dfa->tables[YYTD_ID_EC]) { @@ -476,17 +476,16 @@ unsigned int aa_dfa_match_len(struct aa_dfa *dfa, unsigned int start, * * Returns: final state reached after input is consumed */ -unsigned int aa_dfa_match(struct aa_dfa *dfa, unsigned int start, - const char *str) +aa_state_t aa_dfa_match(struct aa_dfa *dfa, aa_state_t start, const char *str) { u16 *def = DEFAULT_TABLE(dfa); u32 *base = BASE_TABLE(dfa); u16 *next = NEXT_TABLE(dfa); u16 *check = CHECK_TABLE(dfa); - unsigned int state = start; + aa_state_t state = start; - if (state == 0) - return 0; + if (state == DFA_NOMATCH) + return DFA_NOMATCH; /* current state is , matching character *str */ if (dfa->tables[YYTD_ID_EC]) { @@ -515,8 +514,7 @@ unsigned int aa_dfa_match(struct aa_dfa *dfa, unsigned int start, * * Returns: state reach after input @c */ -unsigned int aa_dfa_next(struct aa_dfa *dfa, unsigned int state, - const char c) +aa_state_t aa_dfa_next(struct aa_dfa *dfa, aa_state_t state, const char c) { u16 *def = DEFAULT_TABLE(dfa); u32 *base = BASE_TABLE(dfa); @@ -534,7 +532,7 @@ unsigned int aa_dfa_next(struct aa_dfa *dfa, unsigned int state, return state; } -unsigned int aa_dfa_outofband_transition(struct aa_dfa *dfa, unsigned int state) +aa_state_t aa_dfa_outofband_transition(struct aa_dfa *dfa, aa_state_t state) { u16 *def = DEFAULT_TABLE(dfa); u32 *base = BASE_TABLE(dfa); @@ -564,7 +562,7 @@ unsigned int aa_dfa_outofband_transition(struct aa_dfa *dfa, unsigned int state) * * Returns: final state reached after input is consumed */ -unsigned int aa_dfa_match_until(struct aa_dfa *dfa, unsigned int start, +aa_state_t aa_dfa_match_until(struct aa_dfa *dfa, aa_state_t start, const char *str, const char **retpos) { u16 *def = DEFAULT_TABLE(dfa); @@ -572,10 +570,10 @@ unsigned int aa_dfa_match_until(struct aa_dfa *dfa, unsigned int start, u16 *next = NEXT_TABLE(dfa); u16 *check = CHECK_TABLE(dfa); u32 *accept = ACCEPT_TABLE(dfa); - unsigned int state = start, pos; + aa_state_t state = start, pos; - if (state == 0) - return 0; + if (state == DFA_NOMATCH) + return DFA_NOMATCH; /* current state is , matching character *str */ if (dfa->tables[YYTD_ID_EC]) { @@ -625,7 +623,7 @@ unsigned int aa_dfa_match_until(struct aa_dfa *dfa, unsigned int start, * * Returns: final state reached after input is consumed */ -unsigned int aa_dfa_matchn_until(struct aa_dfa *dfa, unsigned int start, +aa_state_t aa_dfa_matchn_until(struct aa_dfa *dfa, aa_state_t start, const char *str, int n, const char **retpos) { u16 *def = DEFAULT_TABLE(dfa); @@ -633,11 +631,11 @@ unsigned int aa_dfa_matchn_until(struct aa_dfa *dfa, unsigned int start, u16 *next = NEXT_TABLE(dfa); u16 *check = CHECK_TABLE(dfa); u32 *accept = ACCEPT_TABLE(dfa); - unsigned int state = start, pos; + aa_state_t state = start, pos; *retpos = NULL; - if (state == 0) - return 0; + if (state == DFA_NOMATCH) + return DFA_NOMATCH; /* current state is , matching character *str */ if (dfa->tables[YYTD_ID_EC]) { @@ -677,11 +675,11 @@ do { \ } while (0) /* For DFAs that don't support extended tagging of states */ -static bool is_loop(struct match_workbuf *wb, unsigned int state, +static bool is_loop(struct match_workbuf *wb, aa_state_t state, unsigned int *adjust) { - unsigned int pos = wb->pos; - unsigned int i; + aa_state_t pos = wb->pos; + aa_state_t i; if (wb->history[pos] < state) return false; @@ -700,7 +698,7 @@ static bool is_loop(struct match_workbuf *wb, unsigned int state, return true; } -static unsigned int leftmatch_fb(struct aa_dfa *dfa, unsigned int start, +static aa_state_t leftmatch_fb(struct aa_dfa *dfa, aa_state_t start, const char *str, struct match_workbuf *wb, unsigned int *count) { @@ -708,7 +706,7 @@ static unsigned int leftmatch_fb(struct aa_dfa *dfa, unsigned int start, u32 *base = BASE_TABLE(dfa); u16 *next = NEXT_TABLE(dfa); u16 *check = CHECK_TABLE(dfa); - unsigned int state = start, pos; + aa_state_t state = start, pos; AA_BUG(!dfa); AA_BUG(!str); @@ -716,8 +714,8 @@ static unsigned int leftmatch_fb(struct aa_dfa *dfa, unsigned int start, AA_BUG(!count); *count = 0; - if (state == 0) - return 0; + if (state == DFA_NOMATCH) + return DFA_NOMATCH; /* current state is , matching character *str */ if (dfa->tables[YYTD_ID_EC]) { @@ -781,8 +779,8 @@ out: * * Returns: final state reached after input is consumed */ -unsigned int aa_dfa_leftmatch(struct aa_dfa *dfa, unsigned int start, - const char *str, unsigned int *count) +aa_state_t aa_dfa_leftmatch(struct aa_dfa *dfa, aa_state_t start, + const char *str, unsigned int *count) { DEFINE_MATCH_WB(wb); diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 7594f3a3441e5..84aaf25e5deef 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -190,7 +190,7 @@ static int audit_mount(struct aa_profile *profile, const char *op, * * Returns: next state after flags match */ -static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state, +static aa_state_t match_mnt_flags(struct aa_dfa *dfa, aa_state_t state, unsigned long flags) { unsigned int i; @@ -217,12 +217,12 @@ static const char * const mnt_info_table[] = { * Returns 0 on success else element that match failed in, this is the * index into the mnt_info_table above */ -static int do_match_mnt(struct aa_policydb *policy, unsigned int start, +static int do_match_mnt(struct aa_policydb *policy, aa_state_t start, const char *mntpnt, const char *devname, const char *type, unsigned long flags, void *data, bool binary, struct aa_perms *perms) { - unsigned int state; + aa_state_t state; AA_BUG(!policy); AA_BUG(!policy->dfa); @@ -567,7 +567,7 @@ static int profile_umount(struct aa_profile *profile, const struct path *path, { struct aa_perms perms = { }; const char *name = NULL, *info = NULL; - unsigned int state; + aa_state_t state; int error; AA_BUG(!profile); @@ -627,7 +627,7 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, const char *old_name, *new_name = NULL, *info = NULL; const char *trans_name = NULL; struct aa_perms perms = { }; - unsigned int state; + aa_state_t state; int error; AA_BUG(!profile); diff --git a/security/apparmor/net.c b/security/apparmor/net.c index fcfb97079e1be..d420d3aec3b85 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -109,7 +109,7 @@ int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, u32 request, u16 family, int type) { struct aa_perms perms = { }; - unsigned int state; + aa_state_t state; __be16 buffer[2]; AA_BUG(family >= AF_MAX); diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 0917412ba48f4..3ea591d31be7e 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -698,7 +698,7 @@ static u32 map_old_perms(u32 old) } static void compute_fperms_allow(struct aa_perms *perms, struct aa_dfa *dfa, - unsigned int state) + aa_state_t state) { perms->allow |= AA_MAY_GETATTR; @@ -710,7 +710,7 @@ static void compute_fperms_allow(struct aa_perms *perms, struct aa_dfa *dfa, } static struct aa_perms compute_fperms_user(struct aa_dfa *dfa, - unsigned int state) + aa_state_t state) { struct aa_perms perms = { }; @@ -725,7 +725,7 @@ static struct aa_perms compute_fperms_user(struct aa_dfa *dfa, } static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, - unsigned int state) + aa_state_t state) { struct aa_perms perms = { }; @@ -748,8 +748,8 @@ static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, */ static struct aa_perms *compute_fperms(struct aa_dfa *dfa) { - int state; - int state_count; + aa_state_t state; + unsigned int state_count; struct aa_perms *table; AA_BUG(!dfa); @@ -796,7 +796,7 @@ static u32 map_other(u32 x) } static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, - unsigned int state) + aa_state_t state) { struct aa_perms perms = { }; @@ -817,8 +817,8 @@ static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, static struct aa_perms *compute_perms(struct aa_dfa *dfa) { - int state; - int state_count; + unsigned int state; + unsigned int state_count; struct aa_perms *table; AA_BUG(!dfa); -- GitLab From 1b5a6198f5a9d0aa5497da0dc4bcd4fc166ee516 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 6 May 2022 18:57:12 -0700 Subject: [PATCH 0019/1423] apparmor: Fix abi check to include v8 abi The v8 abi is supported by the kernel but the userspace supported version check does not allow for it. This was missed when v8 was added due to a bug in the userspace compiler which was setting an older abi version for v8 encoding (which is forward compatible except on the network encoding). However it is possible to detect the network encoding by checking the policydb network support which the code does. The end result was that missing the abi flag worked until userspace was fixed and began correctly checking for the v8 abi version. Fixes: 56974a6fcfef ("apparmor: add base infastructure for socket mediation") Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 3ea591d31be7e..0203e43460b66 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1183,7 +1183,7 @@ static int verify_header(struct aa_ext *e, int required, const char **ns) * if not specified use previous version * Mask off everything that is not kernel abi version */ - if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v7)) { + if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v8)) { audit_iface(NULL, NULL, NULL, "unsupported interface version", e, error); return error; -- GitLab From 1cf26c3d2c4c2098e39a9905174d7842b531e693 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 7 May 2022 01:58:36 -0700 Subject: [PATCH 0020/1423] apparmor: fix apparmor mediating locking non-fs unix sockets the v8 and earlier policy does not encode the locking permission for no-fs unix sockets. However the kernel is enforcing mediation. Add the AA_MAY_LOCK perm to v8 and earlier computed perm mask which will grant permission for all current abi profiles, but still allow specifying auditing of the operation if needed. Link: http://bugs.launchpad.net/bugs/1780227 Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 0203e43460b66..2406c5c4caaf2 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -31,6 +31,7 @@ #define K_ABI_MASK 0x3ff #define FORCE_COMPLAIN_FLAG 0x800 #define VERSION_LT(X, Y) (((X) & K_ABI_MASK) < ((Y) & K_ABI_MASK)) +#define VERSION_LE(X, Y) (((X) & K_ABI_MASK) <= ((Y) & K_ABI_MASK)) #define VERSION_GT(X, Y) (((X) & K_ABI_MASK) > ((Y) & K_ABI_MASK)) #define v5 5 /* base version */ @@ -796,7 +797,8 @@ static u32 map_other(u32 x) } static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, - aa_state_t state) + aa_state_t state, + u32 version) { struct aa_perms perms = { }; @@ -809,13 +811,15 @@ static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, */ perms.allow |= map_other(dfa_other_allow(dfa, state)); + if (VERSION_LE(version, v8)) + perms.allow |= AA_MAY_LOCK; perms.audit |= map_other(dfa_other_audit(dfa, state)); perms.quiet |= map_other(dfa_other_quiet(dfa, state)); return perms; } -static struct aa_perms *compute_perms(struct aa_dfa *dfa) +static struct aa_perms *compute_perms(struct aa_dfa *dfa, u32 version) { unsigned int state; unsigned int state_count; @@ -831,7 +835,7 @@ static struct aa_perms *compute_perms(struct aa_dfa *dfa) /* zero init so skip the trap state (state == 0) */ for (state = 1; state < state_count; state++) - table[state] = compute_perms_entry(dfa, state); + table[state] = compute_perms_entry(dfa, state, version); return table; } @@ -1055,7 +1059,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; - profile->policy.perms = compute_perms(profile->policy.dfa); + profile->policy.perms = compute_perms(profile->policy.dfa, + e->version); if (!profile->policy.perms) { info = "failed to remap policydb permission table"; goto fail; -- GitLab From 3c076531c5529c94cee330dffc4615ad02bb6edb Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 10 May 2022 02:21:22 -0700 Subject: [PATCH 0021/1423] apparmor: extend policydb permission set by making use of the xbits The policydb permission set has left the xbits unused. Make them available for mediation. Note: that this does not bring full auditing control of the permissions as there are not enough bits. The quieting of denials is provided as that is used more than forced auditing of allowed permissions. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 2406c5c4caaf2..e918831166630 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -38,6 +38,7 @@ #define v6 6 /* per entry policydb mediation check */ #define v7 7 #define v8 8 /* full network masking */ +#define v9 9 /* xbits are used as permission bits in policydb */ /* * The AppArmor interface treats data as a type byte followed by the @@ -796,6 +797,12 @@ static u32 map_other(u32 x) ((x & 0x60) << 19); /* SETOPT/GETOPT */ } +static u32 map_xbits(u32 x) +{ + return ((x & 0x1) << 7) | + ((x & 0x7e) << 9); +} + static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, aa_state_t state, u32 version) @@ -806,15 +813,31 @@ static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, perms.audit = dfa_user_audit(dfa, state); perms.quiet = dfa_user_quiet(dfa, state); - /* for v5 perm mapping in the policydb, the other set is used - * to extend the general perm set + /* + * This mapping is convulated due to history. + * v1-v4: only file perms, which are handled by compute_fperms + * v5: added policydb which dropped user conditional to gain new + * perm bits, but had to map around the xbits because the + * userspace compiler was still munging them. + * v9: adds using the xbits in policydb because the compiler now + * supports treating policydb permission bits different. + * Unfortunately there is no way to force auditing on the + * perms represented by the xbits */ - perms.allow |= map_other(dfa_other_allow(dfa, state)); if (VERSION_LE(version, v8)) perms.allow |= AA_MAY_LOCK; + else + perms.allow |= map_xbits(dfa_user_xbits(dfa, state)); + + /* + * for v5-v9 perm mapping in the policydb, the other set is used + * to extend the general perm set + */ perms.audit |= map_other(dfa_other_audit(dfa, state)); perms.quiet |= map_other(dfa_other_quiet(dfa, state)); + if (VERSION_GT(version, v8)) + perms.quiet |= map_xbits(dfa_other_xbits(dfa, state)); return perms; } @@ -1188,7 +1211,7 @@ static int verify_header(struct aa_ext *e, int required, const char **ns) * if not specified use previous version * Mask off everything that is not kernel abi version */ - if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v8)) { + if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v9)) { audit_iface(NULL, NULL, NULL, "unsupported interface version", e, error); return error; -- GitLab From b06a62ebf5a3f041b22def1608f1a8ab9bbfa951 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 16 May 2022 04:37:08 -0700 Subject: [PATCH 0022/1423] apparmor: move dfa perm macros into policy_unpack Now that the permission remapping macros aren't needed anywhere except during profile unpack, move them. Signed-off-by: John Johansen --- security/apparmor/include/file.h | 51 ------------------------------- security/apparmor/policy_unpack.c | 49 +++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 51 deletions(-) diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 8c82cf279dc2a..4212426020cbe 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -109,57 +109,6 @@ struct path_cond { #define COMBINED_PERM_MASK(X) ((X).allow | (X).audit | (X).quiet | (X).kill) -/* FIXME: split perms from dfa and match this to description - * also add delegation info. - */ -static inline u16 dfa_map_xindex(u16 mask) -{ - u16 old_index = (mask >> 10) & 0xf; - u16 index = 0; - - if (mask & 0x100) - index |= AA_X_UNSAFE; - if (mask & 0x200) - index |= AA_X_INHERIT; - if (mask & 0x80) - index |= AA_X_UNCONFINED; - - if (old_index == 1) { - index |= AA_X_UNCONFINED; - } else if (old_index == 2) { - index |= AA_X_NAME; - } else if (old_index == 3) { - index |= AA_X_NAME | AA_X_CHILD; - } else if (old_index) { - index |= AA_X_TABLE; - index |= old_index - 4; - } - - return index; -} - -/* - * map old dfa inline permissions to new format - */ -#define dfa_user_allow(dfa, state) (((ACCEPT_TABLE(dfa)[state]) & 0x7f) | \ - ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) -#define dfa_user_xbits(dfa, state) (((ACCEPT_TABLE(dfa)[state]) >> 7) & 0x7f) -#define dfa_user_audit(dfa, state) ((ACCEPT_TABLE2(dfa)[state]) & 0x7f) -#define dfa_user_quiet(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 7) & 0x7f) -#define dfa_user_xindex(dfa, state) \ - (dfa_map_xindex(ACCEPT_TABLE(dfa)[state] & 0x3fff)) - -#define dfa_other_allow(dfa, state) ((((ACCEPT_TABLE(dfa)[state]) >> 14) & \ - 0x7f) | \ - ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) -#define dfa_other_xbits(dfa, state) \ - ((((ACCEPT_TABLE(dfa)[state]) >> 7) >> 14) & 0x7f) -#define dfa_other_audit(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 14) & 0x7f) -#define dfa_other_quiet(dfa, state) \ - ((((ACCEPT_TABLE2(dfa)[state]) >> 7) >> 14) & 0x7f) -#define dfa_other_xindex(dfa, state) \ - dfa_map_xindex((ACCEPT_TABLE(dfa)[state] >> 14) & 0x3fff) - int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index e918831166630..32cca5f27b8ff 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -671,6 +671,55 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) return strcmp(data->key, *key); } +/* remap old accept table embedded permissions to separate permission table */ +static u16 dfa_map_xindex(u16 mask) +{ + u16 old_index = (mask >> 10) & 0xf; + u16 index = 0; + + if (mask & 0x100) + index |= AA_X_UNSAFE; + if (mask & 0x200) + index |= AA_X_INHERIT; + if (mask & 0x80) + index |= AA_X_UNCONFINED; + + if (old_index == 1) { + index |= AA_X_UNCONFINED; + } else if (old_index == 2) { + index |= AA_X_NAME; + } else if (old_index == 3) { + index |= AA_X_NAME | AA_X_CHILD; + } else if (old_index) { + index |= AA_X_TABLE; + index |= old_index - 4; + } + + return index; +} + +/* + * map old dfa inline permissions to new format + */ +#define dfa_user_allow(dfa, state) (((ACCEPT_TABLE(dfa)[state]) & 0x7f) | \ + ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) +#define dfa_user_xbits(dfa, state) (((ACCEPT_TABLE(dfa)[state]) >> 7) & 0x7f) +#define dfa_user_audit(dfa, state) ((ACCEPT_TABLE2(dfa)[state]) & 0x7f) +#define dfa_user_quiet(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 7) & 0x7f) +#define dfa_user_xindex(dfa, state) \ + (dfa_map_xindex(ACCEPT_TABLE(dfa)[state] & 0x3fff)) + +#define dfa_other_allow(dfa, state) ((((ACCEPT_TABLE(dfa)[state]) >> 14) & \ + 0x7f) | \ + ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) +#define dfa_other_xbits(dfa, state) \ + ((((ACCEPT_TABLE(dfa)[state]) >> 7) >> 14) & 0x7f) +#define dfa_other_audit(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 14) & 0x7f) +#define dfa_other_quiet(dfa, state) \ + ((((ACCEPT_TABLE2(dfa)[state]) >> 7) >> 14) & 0x7f) +#define dfa_other_xindex(dfa, state) \ + dfa_map_xindex((ACCEPT_TABLE(dfa)[state] >> 14) & 0x3fff) + /** * map_old_perms - map old file perms layout to the new layout * @old: permission set in old mapping -- GitLab From ae6d35ed0a481824a8730c39d5b319c8a76ea00e Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 16 Jul 2022 03:29:19 -0700 Subject: [PATCH 0023/1423] apparmor: extend xindex size Allow the xindex to have 2^24 entries. Signed-off-by: John Johansen --- security/apparmor/include/file.h | 23 +++++++++++------------ security/apparmor/include/perms.h | 2 +- security/apparmor/policy_unpack.c | 8 ++++---- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 4212426020cbe..521c8568f6d49 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -88,18 +88,17 @@ static inline struct aa_label *aa_get_file_label(struct aa_file_ctx *ctx) * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ -#define AA_X_INDEX_MASK 0x03ff - -#define AA_X_TYPE_MASK 0x0c00 -#define AA_X_TYPE_SHIFT 10 -#define AA_X_NONE 0x0000 -#define AA_X_NAME 0x0400 /* use executable name px */ -#define AA_X_TABLE 0x0800 /* use a specified name ->n# */ - -#define AA_X_UNSAFE 0x1000 -#define AA_X_CHILD 0x2000 /* make >AA_X_NONE apply to children */ -#define AA_X_INHERIT 0x4000 -#define AA_X_UNCONFINED 0x8000 +#define AA_X_INDEX_MASK 0x00ffffff + +#define AA_X_TYPE_MASK 0x0c000000 +#define AA_X_NONE 0x00000000 +#define AA_X_NAME 0x04000000 /* use executable name px */ +#define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ + +#define AA_X_UNSAFE 0x10000000 +#define AA_X_CHILD 0x20000000 +#define AA_X_INHERIT 0x40000000 +#define AA_X_UNCONFINED 0x80000000 /* need to make conditional which ones are being set */ struct path_cond { diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 1014a7bbc0278..8739cef735493 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -78,7 +78,7 @@ struct aa_perms { u32 quiet; /* set only when ~allow | deny */ u32 hide; /* set only when ~allow | deny */ - u16 xindex; + u32 xindex; }; #define ALL_PERMS_MASK 0xffffffff diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 32cca5f27b8ff..c578d9af785e0 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -489,8 +489,8 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) int i, size; size = unpack_array(e, NULL); - /* currently 4 exec bits and entries 0-3 are reserved iupcx */ - if (size > 16 - 4) + /* currently 2^24 bits entries 0-3 */ + if (size > (1 << 24)) goto fail; profile->file.trans.table = kcalloc(size, sizeof(char *), GFP_KERNEL); @@ -672,10 +672,10 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) } /* remap old accept table embedded permissions to separate permission table */ -static u16 dfa_map_xindex(u16 mask) +static u32 dfa_map_xindex(u16 mask) { u16 old_index = (mask >> 10) & 0xf; - u16 index = 0; + u32 index = 0; if (mask & 0x100) index |= AA_X_UNSAFE; -- GitLab From caa9f579ca7255e9d6c25f072447d895c5928c97 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 21 Aug 2022 22:48:32 -0700 Subject: [PATCH 0024/1423] apparmor: isolate policy backwards compatibility to its own file The details of mapping old policy into newer policy formats clutters up the unpack code and makes it possible to accidentally use old mappings in code, so isolate the mapping code into its own file. This will become more important when the dfa remapping code lands, as it will greatly expand the compat code base. Signed-off-by: John Johansen --- security/apparmor/Makefile | 3 +- security/apparmor/include/policy_compat.h | 33 +++ security/apparmor/include/policy_unpack.h | 1 + security/apparmor/policy_compat.c | 319 ++++++++++++++++++++++ security/apparmor/policy_unpack.c | 290 +------------------- 5 files changed, 359 insertions(+), 287 deletions(-) create mode 100644 security/apparmor/include/policy_compat.h create mode 100644 security/apparmor/policy_compat.c diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index ff23fcfefe196..4377123c2b988 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o task.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o secid.o file.o policy_ns.o label.o mount.o net.o + resource.o secid.o file.o policy_ns.o label.o mount.o net.o \ + policy_compat.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o clean-files := capability_names.h rlim_names.h net_names.h diff --git a/security/apparmor/include/policy_compat.h b/security/apparmor/include/policy_compat.h new file mode 100644 index 0000000000000..af0e174332df5 --- /dev/null +++ b/security/apparmor/include/policy_compat.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AppArmor security module + * + * Code to provide backwards compatibility with older policy versions, + * by converting/mapping older policy formats into the newer internal + * formats. + * + * Copyright 2022 Canonical Ltd. + */ + +#ifndef __POLICY_COMPAT_H +#define __POLICY_COMPAT_H + +#include "policy.h" + +#define K_ABI_MASK 0x3ff +#define FORCE_COMPLAIN_FLAG 0x800 +#define VERSION_LT(X, Y) (((X) & K_ABI_MASK) < ((Y) & K_ABI_MASK)) +#define VERSION_LE(X, Y) (((X) & K_ABI_MASK) <= ((Y) & K_ABI_MASK)) +#define VERSION_GT(X, Y) (((X) & K_ABI_MASK) > ((Y) & K_ABI_MASK)) + +#define v5 5 /* base version */ +#define v6 6 /* per entry policydb mediation check */ +#define v7 7 +#define v8 8 /* full network masking */ +#define v9 9 /* xbits are used as permission bits in policydb */ + +int aa_compat_map_xmatch(struct aa_policydb *policy); +int aa_compat_map_policy(struct aa_policydb *policy, u32 version); +int aa_compat_map_file(struct aa_policydb *policy); + +#endif /* __POLICY_COMPAT_H */ diff --git a/security/apparmor/include/policy_unpack.h b/security/apparmor/include/policy_unpack.h index eb5f7d7f132bb..cdfbc8a54a9d2 100644 --- a/security/apparmor/include/policy_unpack.h +++ b/security/apparmor/include/policy_unpack.h @@ -16,6 +16,7 @@ #include #include + struct aa_load_ent { struct list_head list; struct aa_profile *new; diff --git a/security/apparmor/policy_compat.c b/security/apparmor/policy_compat.c new file mode 100644 index 0000000000000..1aa5cced935ee --- /dev/null +++ b/security/apparmor/policy_compat.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AppArmor security module + * + * This file contains AppArmor functions for unpacking policy loaded + * from userspace. + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2022 Canonical Ltd. + * + * Code to provide backwards compatibility with older policy versions, + * by converting/mapping older policy formats into the newer internal + * formats. + */ + +#include +#include + +#include "include/lib.h" +#include "include/policy_unpack.h" +#include "include/policy_compat.h" + +/* remap old accept table embedded permissions to separate permission table */ +static u32 dfa_map_xindex(u16 mask) +{ + u16 old_index = (mask >> 10) & 0xf; + u32 index = 0; + + if (mask & 0x100) + index |= AA_X_UNSAFE; + if (mask & 0x200) + index |= AA_X_INHERIT; + if (mask & 0x80) + index |= AA_X_UNCONFINED; + + if (old_index == 1) { + index |= AA_X_UNCONFINED; + } else if (old_index == 2) { + index |= AA_X_NAME; + } else if (old_index == 3) { + index |= AA_X_NAME | AA_X_CHILD; + } else if (old_index) { + index |= AA_X_TABLE; + index |= old_index - 4; + } + + return index; +} + +/* + * map old dfa inline permissions to new format + */ +#define dfa_user_allow(dfa, state) (((ACCEPT_TABLE(dfa)[state]) & 0x7f) | \ + ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) +#define dfa_user_xbits(dfa, state) (((ACCEPT_TABLE(dfa)[state]) >> 7) & 0x7f) +#define dfa_user_audit(dfa, state) ((ACCEPT_TABLE2(dfa)[state]) & 0x7f) +#define dfa_user_quiet(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 7) & 0x7f) +#define dfa_user_xindex(dfa, state) \ + (dfa_map_xindex(ACCEPT_TABLE(dfa)[state] & 0x3fff)) + +#define dfa_other_allow(dfa, state) ((((ACCEPT_TABLE(dfa)[state]) >> 14) & \ + 0x7f) | \ + ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) +#define dfa_other_xbits(dfa, state) \ + ((((ACCEPT_TABLE(dfa)[state]) >> 7) >> 14) & 0x7f) +#define dfa_other_audit(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 14) & 0x7f) +#define dfa_other_quiet(dfa, state) \ + ((((ACCEPT_TABLE2(dfa)[state]) >> 7) >> 14) & 0x7f) +#define dfa_other_xindex(dfa, state) \ + dfa_map_xindex((ACCEPT_TABLE(dfa)[state] >> 14) & 0x3fff) + +/** + * map_old_perms - map old file perms layout to the new layout + * @old: permission set in old mapping + * + * Returns: new permission mapping + */ +static u32 map_old_perms(u32 old) +{ + u32 new = old & 0xf; + + if (old & MAY_READ) + new |= AA_MAY_GETATTR | AA_MAY_OPEN; + if (old & MAY_WRITE) + new |= AA_MAY_SETATTR | AA_MAY_CREATE | AA_MAY_DELETE | + AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_OPEN; + if (old & 0x10) + new |= AA_MAY_LINK; + /* the old mapping lock and link_subset flags where overlaid + * and use was determined by part of a pair that they were in + */ + if (old & 0x20) + new |= AA_MAY_LOCK | AA_LINK_SUBSET; + if (old & 0x40) /* AA_EXEC_MMAP */ + new |= AA_EXEC_MMAP; + + return new; +} + +static void compute_fperms_allow(struct aa_perms *perms, struct aa_dfa *dfa, + aa_state_t state) +{ + perms->allow |= AA_MAY_GETATTR; + + /* change_profile wasn't determined by ownership in old mapping */ + if (ACCEPT_TABLE(dfa)[state] & 0x80000000) + perms->allow |= AA_MAY_CHANGE_PROFILE; + if (ACCEPT_TABLE(dfa)[state] & 0x40000000) + perms->allow |= AA_MAY_ONEXEC; +} + +static struct aa_perms compute_fperms_user(struct aa_dfa *dfa, + aa_state_t state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_user_allow(dfa, state)); + perms.audit = map_old_perms(dfa_user_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); + perms.xindex = dfa_user_xindex(dfa, state); + + compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, + aa_state_t state) +{ + struct aa_perms perms = { }; + + perms.allow = map_old_perms(dfa_other_allow(dfa, state)); + perms.audit = map_old_perms(dfa_other_audit(dfa, state)); + perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); + perms.xindex = dfa_other_xindex(dfa, state); + + compute_fperms_allow(&perms, dfa, state); + + return perms; +} + +/** + * aa_compute_fperms - convert dfa compressed perms to internal perms and store + * them so they can be retrieved later. + * @dfa: a dfa using fperms to remap to internal permissions + * + * Returns: remapped perm table + */ +static struct aa_perms *compute_fperms(struct aa_dfa *dfa) +{ + aa_state_t state; + unsigned int state_count; + struct aa_perms *table; + + AA_BUG(!dfa); + + state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ + table = kvcalloc(state_count * 2, sizeof(struct aa_perms), GFP_KERNEL); + if (!table) + return NULL; + + /* zero init so skip the trap state (state == 0) */ + for (state = 1; state < state_count; state++) { + table[state * 2] = compute_fperms_user(dfa, state); + table[state * 2 + 1] = compute_fperms_other(dfa, state); + } + + return table; +} + +static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) +{ + struct aa_perms *perms; + int state; + int state_count; + + AA_BUG(!xmatch); + + state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ + perms = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); + + /* zero init so skip the trap state (state == 0) */ + for (state = 1; state < state_count; state++) + perms[state].allow = dfa_user_allow(xmatch, state); + + return perms; +} + +static u32 map_other(u32 x) +{ + return ((x & 0x3) << 8) | /* SETATTR/GETATTR */ + ((x & 0x1c) << 18) | /* ACCEPT/BIND/LISTEN */ + ((x & 0x60) << 19); /* SETOPT/GETOPT */ +} + +static u32 map_xbits(u32 x) +{ + return ((x & 0x1) << 7) | + ((x & 0x7e) << 9); +} + +static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, + aa_state_t state, + u32 version) +{ + struct aa_perms perms = { }; + + perms.allow = dfa_user_allow(dfa, state); + perms.audit = dfa_user_audit(dfa, state); + perms.quiet = dfa_user_quiet(dfa, state); + + /* + * This mapping is convulated due to history. + * v1-v4: only file perms, which are handled by compute_fperms + * v5: added policydb which dropped user conditional to gain new + * perm bits, but had to map around the xbits because the + * userspace compiler was still munging them. + * v9: adds using the xbits in policydb because the compiler now + * supports treating policydb permission bits different. + * Unfortunately there is no way to force auditing on the + * perms represented by the xbits + */ + perms.allow |= map_other(dfa_other_allow(dfa, state)); + if (VERSION_LE(version, v8)) + perms.allow |= AA_MAY_LOCK; + else + perms.allow |= map_xbits(dfa_user_xbits(dfa, state)); + + /* + * for v5-v9 perm mapping in the policydb, the other set is used + * to extend the general perm set + */ + perms.audit |= map_other(dfa_other_audit(dfa, state)); + perms.quiet |= map_other(dfa_other_quiet(dfa, state)); + if (VERSION_GT(version, v8)) + perms.quiet |= map_xbits(dfa_other_xbits(dfa, state)); + + return perms; +} + +static struct aa_perms *compute_perms(struct aa_dfa *dfa, u32 version) +{ + unsigned int state; + unsigned int state_count; + struct aa_perms *table; + + AA_BUG(!dfa); + + state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + /* DFAs are restricted from having a state_count of less than 2 */ + table = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); + if (!table) + return NULL; + + /* zero init so skip the trap state (state == 0) */ + for (state = 1; state < state_count; state++) + table[state] = compute_perms_entry(dfa, state, version); + + return table; +} + +/** + * remap_dfa_accept - remap old dfa accept table to be an index + * @dfa: dfa to do the remapping on + * @factor: scaling factor for the index conversion. + * + * Used in conjunction with compute_Xperms, it converts old style perms + * that are encoded in the dfa accept tables to the new style where + * there is a permission table and the accept table is an index into + * the permission table. + */ +static void remap_dfa_accept(struct aa_dfa *dfa, unsigned int factor) +{ + unsigned int state; + unsigned int state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; + + AA_BUG(!dfa); + + for (state = 0; state < state_count; state++) + ACCEPT_TABLE(dfa)[state] = state * factor; + kvfree(dfa->tables[YYTD_ID_ACCEPT2]); + dfa->tables[YYTD_ID_ACCEPT2] = NULL; +} + +/* TODO: merge different dfa mappings into single map_policy fn */ +int aa_compat_map_xmatch(struct aa_policydb *policy) +{ + policy->perms = compute_xmatch_perms(policy->dfa); + if (!policy->perms) + return -ENOMEM; + + remap_dfa_accept(policy->dfa, 1); + + return 0; +} + +int aa_compat_map_policy(struct aa_policydb *policy, u32 version) +{ + policy->perms = compute_perms(policy->dfa, version); + if (!policy->perms) + return -ENOMEM; + + remap_dfa_accept(policy->dfa, 1); + + return 0; +} + +int aa_compat_map_file(struct aa_policydb *policy) +{ + policy->perms = compute_fperms(policy->dfa); + if (!policy->perms) + return -ENOMEM; + + remap_dfa_accept(policy->dfa, 2); + + return 0; +} diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index c578d9af785e0..63196df2841b5 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -27,18 +27,8 @@ #include "include/path.h" #include "include/policy.h" #include "include/policy_unpack.h" +#include "include/policy_compat.h" -#define K_ABI_MASK 0x3ff -#define FORCE_COMPLAIN_FLAG 0x800 -#define VERSION_LT(X, Y) (((X) & K_ABI_MASK) < ((Y) & K_ABI_MASK)) -#define VERSION_LE(X, Y) (((X) & K_ABI_MASK) <= ((Y) & K_ABI_MASK)) -#define VERSION_GT(X, Y) (((X) & K_ABI_MASK) > ((Y) & K_ABI_MASK)) - -#define v5 5 /* base version */ -#define v6 6 /* per entry policydb mediation check */ -#define v7 7 -#define v8 8 /* full network masking */ -#define v9 9 /* xbits are used as permission bits in policydb */ /* * The AppArmor interface treats data as a type byte followed by the @@ -671,270 +661,6 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) return strcmp(data->key, *key); } -/* remap old accept table embedded permissions to separate permission table */ -static u32 dfa_map_xindex(u16 mask) -{ - u16 old_index = (mask >> 10) & 0xf; - u32 index = 0; - - if (mask & 0x100) - index |= AA_X_UNSAFE; - if (mask & 0x200) - index |= AA_X_INHERIT; - if (mask & 0x80) - index |= AA_X_UNCONFINED; - - if (old_index == 1) { - index |= AA_X_UNCONFINED; - } else if (old_index == 2) { - index |= AA_X_NAME; - } else if (old_index == 3) { - index |= AA_X_NAME | AA_X_CHILD; - } else if (old_index) { - index |= AA_X_TABLE; - index |= old_index - 4; - } - - return index; -} - -/* - * map old dfa inline permissions to new format - */ -#define dfa_user_allow(dfa, state) (((ACCEPT_TABLE(dfa)[state]) & 0x7f) | \ - ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) -#define dfa_user_xbits(dfa, state) (((ACCEPT_TABLE(dfa)[state]) >> 7) & 0x7f) -#define dfa_user_audit(dfa, state) ((ACCEPT_TABLE2(dfa)[state]) & 0x7f) -#define dfa_user_quiet(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 7) & 0x7f) -#define dfa_user_xindex(dfa, state) \ - (dfa_map_xindex(ACCEPT_TABLE(dfa)[state] & 0x3fff)) - -#define dfa_other_allow(dfa, state) ((((ACCEPT_TABLE(dfa)[state]) >> 14) & \ - 0x7f) | \ - ((ACCEPT_TABLE(dfa)[state]) & 0x80000000)) -#define dfa_other_xbits(dfa, state) \ - ((((ACCEPT_TABLE(dfa)[state]) >> 7) >> 14) & 0x7f) -#define dfa_other_audit(dfa, state) (((ACCEPT_TABLE2(dfa)[state]) >> 14) & 0x7f) -#define dfa_other_quiet(dfa, state) \ - ((((ACCEPT_TABLE2(dfa)[state]) >> 7) >> 14) & 0x7f) -#define dfa_other_xindex(dfa, state) \ - dfa_map_xindex((ACCEPT_TABLE(dfa)[state] >> 14) & 0x3fff) - -/** - * map_old_perms - map old file perms layout to the new layout - * @old: permission set in old mapping - * - * Returns: new permission mapping - */ -static u32 map_old_perms(u32 old) -{ - u32 new = old & 0xf; - - if (old & MAY_READ) - new |= AA_MAY_GETATTR | AA_MAY_OPEN; - if (old & MAY_WRITE) - new |= AA_MAY_SETATTR | AA_MAY_CREATE | AA_MAY_DELETE | - AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_OPEN; - if (old & 0x10) - new |= AA_MAY_LINK; - /* the old mapping lock and link_subset flags where overlaid - * and use was determined by part of a pair that they were in - */ - if (old & 0x20) - new |= AA_MAY_LOCK | AA_LINK_SUBSET; - if (old & 0x40) /* AA_EXEC_MMAP */ - new |= AA_EXEC_MMAP; - - return new; -} - -static void compute_fperms_allow(struct aa_perms *perms, struct aa_dfa *dfa, - aa_state_t state) -{ - perms->allow |= AA_MAY_GETATTR; - - /* change_profile wasn't determined by ownership in old mapping */ - if (ACCEPT_TABLE(dfa)[state] & 0x80000000) - perms->allow |= AA_MAY_CHANGE_PROFILE; - if (ACCEPT_TABLE(dfa)[state] & 0x40000000) - perms->allow |= AA_MAY_ONEXEC; -} - -static struct aa_perms compute_fperms_user(struct aa_dfa *dfa, - aa_state_t state) -{ - struct aa_perms perms = { }; - - perms.allow = map_old_perms(dfa_user_allow(dfa, state)); - perms.audit = map_old_perms(dfa_user_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); - perms.xindex = dfa_user_xindex(dfa, state); - - compute_fperms_allow(&perms, dfa, state); - - return perms; -} - -static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, - aa_state_t state) -{ - struct aa_perms perms = { }; - - perms.allow = map_old_perms(dfa_other_allow(dfa, state)); - perms.audit = map_old_perms(dfa_other_audit(dfa, state)); - perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); - perms.xindex = dfa_other_xindex(dfa, state); - - compute_fperms_allow(&perms, dfa, state); - - return perms; -} - -/** - * aa_compute_fperms - convert dfa compressed perms to internal perms and store - * them so they can be retrieved later. - * @dfa: a dfa using fperms to remap to internal permissions - * - * Returns: remapped perm table - */ -static struct aa_perms *compute_fperms(struct aa_dfa *dfa) -{ - aa_state_t state; - unsigned int state_count; - struct aa_perms *table; - - AA_BUG(!dfa); - - state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; - /* DFAs are restricted from having a state_count of less than 2 */ - table = kvcalloc(state_count * 2, sizeof(struct aa_perms), GFP_KERNEL); - if (!table) - return NULL; - - /* zero init so skip the trap state (state == 0) */ - for (state = 1; state < state_count; state++) { - table[state * 2] = compute_fperms_user(dfa, state); - table[state * 2 + 1] = compute_fperms_other(dfa, state); - } - - return table; -} - -static struct aa_perms *compute_xmatch_perms(struct aa_dfa *xmatch) -{ - struct aa_perms *perms; - int state; - int state_count; - - AA_BUG(!xmatch); - - state_count = xmatch->tables[YYTD_ID_BASE]->td_lolen; - /* DFAs are restricted from having a state_count of less than 2 */ - perms = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); - - /* zero init so skip the trap state (state == 0) */ - for (state = 1; state < state_count; state++) - perms[state].allow = dfa_user_allow(xmatch, state); - - return perms; -} - -static u32 map_other(u32 x) -{ - return ((x & 0x3) << 8) | /* SETATTR/GETATTR */ - ((x & 0x1c) << 18) | /* ACCEPT/BIND/LISTEN */ - ((x & 0x60) << 19); /* SETOPT/GETOPT */ -} - -static u32 map_xbits(u32 x) -{ - return ((x & 0x1) << 7) | - ((x & 0x7e) << 9); -} - -static struct aa_perms compute_perms_entry(struct aa_dfa *dfa, - aa_state_t state, - u32 version) -{ - struct aa_perms perms = { }; - - perms.allow = dfa_user_allow(dfa, state); - perms.audit = dfa_user_audit(dfa, state); - perms.quiet = dfa_user_quiet(dfa, state); - - /* - * This mapping is convulated due to history. - * v1-v4: only file perms, which are handled by compute_fperms - * v5: added policydb which dropped user conditional to gain new - * perm bits, but had to map around the xbits because the - * userspace compiler was still munging them. - * v9: adds using the xbits in policydb because the compiler now - * supports treating policydb permission bits different. - * Unfortunately there is no way to force auditing on the - * perms represented by the xbits - */ - perms.allow |= map_other(dfa_other_allow(dfa, state)); - if (VERSION_LE(version, v8)) - perms.allow |= AA_MAY_LOCK; - else - perms.allow |= map_xbits(dfa_user_xbits(dfa, state)); - - /* - * for v5-v9 perm mapping in the policydb, the other set is used - * to extend the general perm set - */ - perms.audit |= map_other(dfa_other_audit(dfa, state)); - perms.quiet |= map_other(dfa_other_quiet(dfa, state)); - if (VERSION_GT(version, v8)) - perms.quiet |= map_xbits(dfa_other_xbits(dfa, state)); - - return perms; -} - -static struct aa_perms *compute_perms(struct aa_dfa *dfa, u32 version) -{ - unsigned int state; - unsigned int state_count; - struct aa_perms *table; - - AA_BUG(!dfa); - - state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; - /* DFAs are restricted from having a state_count of less than 2 */ - table = kvcalloc(state_count, sizeof(struct aa_perms), GFP_KERNEL); - if (!table) - return NULL; - - /* zero init so skip the trap state (state == 0) */ - for (state = 1; state < state_count; state++) - table[state] = compute_perms_entry(dfa, state, version); - - return table; -} - -/** - * remap_dfa_accept - remap old dfa accept table to be an index - * @dfa: dfa to do the remapping on - * @factor: scaling factor for the index conversion. - * - * Used in conjunction with compute_Xperms, it converts old style perms - * that are encoded in the dfa accept tables to the new style where - * there is a permission table and the accept table is an index into - * the permission table. - */ -static void remap_dfa_accept(struct aa_dfa *dfa, unsigned int factor) -{ - unsigned int state; - unsigned int state_count = dfa->tables[YYTD_ID_BASE]->td_lolen; - - AA_BUG(!dfa); - - for (state = 0; state < state_count; state++) - ACCEPT_TABLE(dfa)[state] = state * factor; - kvfree(dfa->tables[YYTD_ID_ACCEPT2]); - dfa->tables[YYTD_ID_ACCEPT2] = NULL; -} - /** * unpack_profile - unpack a serialized profile * @e: serialized data extent information (NOT NULL) @@ -1001,12 +727,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } profile->xmatch_len = tmp; profile->xmatch.start[AA_CLASS_XMATCH] = DFA_START; - profile->xmatch.perms = compute_xmatch_perms(profile->xmatch.dfa); - if (!profile->xmatch.perms) { + if (aa_compat_map_xmatch(&profile->xmatch)) { info = "failed to convert xmatch permission table"; goto fail; } - remap_dfa_accept(profile->xmatch.dfa, 1); } /* disconnected attachment string is optional */ @@ -1131,14 +855,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; - profile->policy.perms = compute_perms(profile->policy.dfa, - e->version); - if (!profile->policy.perms) { + if (aa_compat_map_policy(&profile->policy, e->version)) { info = "failed to remap policydb permission table"; goto fail; } - /* Do not remap internal dfas */ - remap_dfa_accept(profile->policy.dfa, 1); } else profile->policy.dfa = aa_get_dfa(nulldfa); @@ -1154,12 +874,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) "dfa_start")) /* default start state */ profile->file.start[AA_CLASS_FILE] = DFA_START; - profile->file.perms = compute_fperms(profile->file.dfa); - if (!profile->file.perms) { + if (aa_compat_map_file(&profile->file)) { info = "failed to remap file permission table"; goto fail; } - remap_dfa_accept(profile->file.dfa, 2); if (!unpack_trans_table(e, profile)) { info = "failed to unpack profile transition table"; goto fail; -- GitLab From 90917d5b6866df79d892087ba51b46c983d2fcfe Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 16 Jul 2022 03:33:43 -0700 Subject: [PATCH 0025/1423] apparmor: extend permissions to support a label and tag string add indexes for label and tag entries. Rename the domain table to the str_table as its a shared string table with label and tags. Signed-off-by: John Johansen --- security/apparmor/domain.c | 18 ------------------ security/apparmor/include/domain.h | 6 ------ security/apparmor/include/lib.h | 6 ++++++ security/apparmor/include/perms.h | 2 ++ security/apparmor/include/policy.h | 6 ++++-- security/apparmor/lib.c | 19 +++++++++++++++++++ security/apparmor/policy_unpack.c | 2 +- 7 files changed, 32 insertions(+), 27 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 5883f0fc02d3d..4cb046cf3a14f 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -29,24 +29,6 @@ #include "include/policy.h" #include "include/policy_ns.h" -/** - * aa_free_domain_entries - free entries in a domain table - * @domain: the domain table to free (MAYBE NULL) - */ -void aa_free_domain_entries(struct aa_domain *domain) -{ - int i; - if (domain) { - if (!domain->table) - return; - - for (i = 0; i < domain->size; i++) - kfree_sensitive(domain->table[i]); - kfree_sensitive(domain->table); - domain->table = NULL; - } -} - /** * may_change_ptraced_domain - check if can change profile on ptraced task * @to_label: profile to change to (NOT NULL) diff --git a/security/apparmor/include/domain.h b/security/apparmor/include/domain.h index d14928fe1c6f1..77f9a0ed0f048 100644 --- a/security/apparmor/include/domain.h +++ b/security/apparmor/include/domain.h @@ -16,11 +16,6 @@ #ifndef __AA_DOMAIN_H #define __AA_DOMAIN_H -struct aa_domain { - int size; - char **table; -}; - #define AA_CHANGE_NOFLAGS 0 #define AA_CHANGE_TEST 1 #define AA_CHANGE_CHILD 2 @@ -32,7 +27,6 @@ struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm); -void aa_free_domain_entries(struct aa_domain *domain); int aa_change_hat(const char *hats[], int count, u64 token, int flags); int aa_change_profile(const char *fqname, int flags); diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index f176f3ced2a36..f1a29ab7ea1b4 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -99,6 +99,12 @@ static inline bool path_mediated_fs(struct dentry *dentry) return !(dentry->d_sb->s_flags & SB_NOUSER); } +struct aa_str_table { + int size; + char **table; +}; + +void aa_free_str_table(struct aa_str_table *table); struct counted_str { struct kref count; diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 8739cef735493..d66059fcebb45 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -79,6 +79,8 @@ struct aa_perms { u32 hide; /* set only when ~allow | deny */ u32 xindex; + u32 tag; /* tag string index, if present */ + u32 label; /* label string index, if present */ }; #define ALL_PERMS_MASK 0xffffffff diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 3a7d165e8fcc1..a28a662a06221 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -72,12 +72,14 @@ enum profile_mode { /* struct aa_policydb - match engine for a policy * dfa: dfa pattern match + * perms: table of permissions + * strs: table of strings, index by x * start: set of start states for the different classes of data */ struct aa_policydb { struct aa_dfa *dfa; struct aa_perms *perms; - struct aa_domain trans; + struct aa_str_table trans; aa_state_t start[AA_CLASS_LAST + 1]; }; @@ -86,7 +88,7 @@ static inline void aa_destroy_policydb(struct aa_policydb *policy) aa_put_dfa(policy->dfa); if (policy->perms) kvfree(policy->perms); - aa_free_domain_entries(&policy->trans); + aa_free_str_table(&policy->trans); } diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 60deb4dc30c7f..69aeb2dbd6d6b 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -25,6 +25,25 @@ struct aa_perms allperms = { .allow = ALL_PERMS_MASK, .quiet = ALL_PERMS_MASK, .hide = ALL_PERMS_MASK }; +/** + * aa_free_str_table - free entries str table + * @str: the string table to free (MAYBE NULL) + */ +void aa_free_str_table(struct aa_str_table *t) +{ + int i; + + if (t) { + if (!t->table) + return; + + for (i = 0; i < t->size; i++) + kfree_sensitive(t->table[i]); + kfree_sensitive(t->table); + t->table = NULL; + } +} + /** * aa_split_fqname - split a fqname into a profile and namespace name * @fqname: a full qualified name in namespace profile format (NOT NULL) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 63196df2841b5..df39ee8f4e03d 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -534,7 +534,7 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) return true; fail: - aa_free_domain_entries(&profile->file.trans); + aa_free_str_table(&profile->file.trans); e->pos = saved_pos; return false; } -- GitLab From 8c4b785a86be1219f7d50f7b38266c454d6a9bbc Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 19 Apr 2022 16:25:55 -0700 Subject: [PATCH 0026/1423] apparmor: add mediation class information to auditing Audit messages currently don't contain the mediation class which can make them less clear than they should be in some circumstances. With newer mediation classes coming this potential confusion will become worse. Fix this by adding the mediatin class to the messages. Signed-off-by: John Johansen --- security/apparmor/audit.c | 28 ++++++++++++++++++++++++++++ security/apparmor/capability.c | 2 +- security/apparmor/file.c | 2 +- security/apparmor/include/apparmor.h | 2 +- security/apparmor/include/audit.h | 8 ++++++-- security/apparmor/include/net.h | 1 + security/apparmor/ipc.c | 2 +- security/apparmor/lib.c | 2 +- security/apparmor/lsm.c | 3 ++- security/apparmor/mount.c | 2 +- security/apparmor/policy.c | 2 +- security/apparmor/policy_unpack.c | 2 +- security/apparmor/resource.c | 3 ++- security/apparmor/task.c | 2 +- 14 files changed, 48 insertions(+), 13 deletions(-) diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 704b0c895605a..e638f7bc9f528 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -36,6 +36,28 @@ static const char *const aa_audit_type[] = { "AUTO" }; +static const char *const aa_class_names[] = { + "none", + "unknown", + "file", + "cap", + "net", + "rlimits", + "domain", + "mount", + "unknown", + "ptrace", + "signal", + "unknown", + "unknown", + "unknown", + "net", + "unknown", + "label", + "lsm", +}; + + /* * Currently AppArmor auditing is fed straight into the audit framework. * @@ -65,6 +87,12 @@ static void audit_pre(struct audit_buffer *ab, void *ca) audit_log_format(ab, " operation=\"%s\"", aad(sa)->op); } + if (aad(sa)->class) + audit_log_format(ab, " class=\"%s\"", + aad(sa)->class <= AA_CLASS_LAST ? + aa_class_names[aad(sa)->class] : + "unknown"); + if (aad(sa)->info) { audit_log_format(ab, " info=\"%s\"", aad(sa)->info); if (aad(sa)->error) diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index deccea8654ad8..6cabd6109f12a 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -148,7 +148,7 @@ int aa_capable(struct aa_label *label, int cap, unsigned int opts) { struct aa_profile *profile; int error = 0; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_CAP, OP_CAPABLE); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_CAP, AA_CLASS_CAP, OP_CAPABLE); sa.u.cap = cap; error = fn_for_each_confined(label, profile, diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 636efcade3f58..69d936d04f948 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -95,7 +95,7 @@ int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, kuid_t ouid, const char *info, int error) { int type = AUDIT_APPARMOR_AUTO; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_TASK, op); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_TASK, AA_CLASS_FILE, op); sa.u.tsk = NULL; aad(&sa)->request = request; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 8fd66a4ca0b86..6d9ca075fcb9c 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -16,7 +16,7 @@ /* * Class of mediation types in the AppArmor policy db */ -#define AA_CLASS_ENTRY 0 +#define AA_CLASS_NONE 0 #define AA_CLASS_UNKNOWN 1 #define AA_CLASS_FILE 2 #define AA_CLASS_CAP 3 diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 18519a4eb67e3..c328f07f11cd8 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -107,6 +107,7 @@ enum audit_type { struct apparmor_audit_data { int error; int type; + u16 class; const char *op; struct aa_label *label; const char *name; @@ -155,9 +156,12 @@ struct apparmor_audit_data { /* macros for dealing with apparmor_audit_data structure */ #define aad(SA) ((SA)->apparmor_audit_data) -#define DEFINE_AUDIT_DATA(NAME, T, X) \ +#define DEFINE_AUDIT_DATA(NAME, T, C, X) \ /* TODO: cleanup audit init so we don't need _aad = {0,} */ \ - struct apparmor_audit_data NAME ## _aad = { .op = (X), }; \ + struct apparmor_audit_data NAME ## _aad = { \ + .class = (C), \ + .op = (X), \ + }; \ struct common_audit_data NAME = \ { \ .type = (T), \ diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h index aadb4b29fb66e..6fa440b5daed8 100644 --- a/security/apparmor/include/net.h +++ b/security/apparmor/include/net.h @@ -59,6 +59,7 @@ struct aa_sk_ctx { DEFINE_AUDIT_DATA(NAME, \ ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \ LSM_AUDIT_DATA_NONE, \ + AA_CLASS_NET, \ OP); \ NAME.u.net = &(NAME ## _net); \ aad(&NAME)->net.type = (T); \ diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 7255a9d523728..4ecaf2ba26c54 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -98,7 +98,7 @@ static int profile_signal_perm(struct aa_profile *profile, int aa_may_signal(struct aa_label *sender, struct aa_label *target, int sig) { struct aa_profile *profile; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_SIGNAL); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_SIGNAL, OP_SIGNAL); aad(&sa)->signal = map_signal_num(sig); aad(&sa)->unmappedsig = sig; diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 69aeb2dbd6d6b..768cc182e9cab 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -143,7 +143,7 @@ const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name, void aa_info_message(const char *str) { if (audit_enabled) { - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, NULL); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, NULL); aad(&sa)->info = str; aa_audit_msg(AUDIT_APPARMOR_STATUS, &sa, NULL); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index ec873ff0a4bbb..784709286a626 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -647,7 +647,8 @@ static int apparmor_setprocattr(const char *name, void *value, char *command, *largs = NULL, *args = value; size_t arg_size; int error; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_SETPROCATTR); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, + OP_SETPROCATTR); if (size == 0) return -EINVAL; diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 84aaf25e5deef..02d8215cb9fd6 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -134,7 +134,7 @@ static int audit_mount(struct aa_profile *profile, const char *op, struct aa_perms *perms, const char *info, int error) { int audit_type = AUDIT_APPARMOR_AUTO; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, op); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_MOUNT, op); if (likely(!error)) { u32 mask = perms->audit; diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index cdcf26c9bed57..6222236de0216 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -617,7 +617,7 @@ static int audit_policy(struct aa_label *label, const char *op, const char *ns_name, const char *name, const char *info, int error) { - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, op); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, op); aad(&sa)->iface.ns = ns_name; aad(&sa)->name = name; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index df39ee8f4e03d..4bf33bd0ca694 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -100,7 +100,7 @@ static int audit_iface(struct aa_profile *new, const char *ns_name, int error) { struct aa_profile *profile = labels_profile(aa_current_raw_label()); - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, NULL); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, NULL); if (e) aad(&sa)->iface.pos = e->pos - e->start; aad(&sa)->iface.ns = ns_name; diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index 1ae4874251a96..cc018469e22d7 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -53,7 +53,8 @@ static int audit_resource(struct aa_profile *profile, unsigned int resource, unsigned long value, struct aa_label *peer, const char *info, int error) { - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_SETRLIMIT); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_RLIMITS, + OP_SETRLIMIT); aad(&sa)->rlim.rlim = resource; aad(&sa)->rlim.max = value; diff --git a/security/apparmor/task.c b/security/apparmor/task.c index 503dc0877fb1a..b19900f85c148 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -285,7 +285,7 @@ int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, { struct aa_profile *profile; u32 xrequest = request << PTRACE_PERM_SHIFT; - DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_PTRACE); + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, AA_CLASS_PTRACE, OP_PTRACE); return xcheck_labels(tracer, tracee, profile, profile_tracer_perm(profile, tracee, request, &sa), -- GitLab From 22fac8a051191113becc0da62bf88b0ba8ce6c08 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 17 Dec 2019 15:40:41 -0800 Subject: [PATCH 0027/1423] apparmor: add user mode flag Allow the profile to contain a user mode prompt flag. This works similar to complain mode but will try to send messages to a userspace daemon. If the daemon is not present or timesout regular informent will occur. Signed-off-by: John Johansen --- security/apparmor/include/policy.h | 3 +++ security/apparmor/include/policy_unpack.h | 1 + security/apparmor/lib.c | 7 ++----- security/apparmor/policy.c | 1 + security/apparmor/policy_unpack.c | 2 ++ 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index a28a662a06221..9fc5d7fa36e86 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -44,6 +44,8 @@ extern const char *const aa_profile_mode_names[]; #define COMPLAIN_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_COMPLAIN) +#define USER_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_USER) + #define KILL_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_KILL) #define PROFILE_IS_HAT(_profile) ((_profile)->label.flags & FLAG_HAT) @@ -67,6 +69,7 @@ enum profile_mode { APPARMOR_COMPLAIN, /* allow and log access violations */ APPARMOR_KILL, /* kill task on access violation */ APPARMOR_UNCONFINED, /* profile set to unconfined */ + APPARMOR_USER, /* modified complain mode to userspace */ }; diff --git a/security/apparmor/include/policy_unpack.h b/security/apparmor/include/policy_unpack.h index cdfbc8a54a9d2..1e10e360a0ec1 100644 --- a/security/apparmor/include/policy_unpack.h +++ b/security/apparmor/include/policy_unpack.h @@ -36,6 +36,7 @@ struct aa_load_ent *aa_load_ent_alloc(void); #define PACKED_MODE_COMPLAIN 1 #define PACKED_MODE_KILL 2 #define PACKED_MODE_UNCONFINED 3 +#define PACKED_MODE_USER 4 struct aa_ns; diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 768cc182e9cab..b0fcec8932745 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -327,11 +327,8 @@ void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms) perms->kill = ALL_PERMS_MASK; else if (COMPLAIN_MODE(profile)) perms->complain = ALL_PERMS_MASK; -/* - * TODO: - * else if (PROMPT_MODE(profile)) - * perms->prompt = ALL_PERMS_MASK; - */ + else if (USER_MODE(profile)) + perms->prompt = ALL_PERMS_MASK; } /** diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 6222236de0216..3c3a5263695de 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -94,6 +94,7 @@ const char *const aa_profile_mode_names[] = { "complain", "kill", "unconfined", + "user", }; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 4bf33bd0ca694..04e9fca250df0 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -761,6 +761,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else if (tmp == PACKED_MODE_UNCONFINED) { profile->mode = APPARMOR_UNCONFINED; profile->label.flags |= FLAG_UNCONFINED; + } else if (tmp == PACKED_MODE_USER) { + profile->mode = APPARMOR_USER; } else { goto fail; } -- GitLab From a0792e2ceddc1bff8bda34a82b5ef7f00cbe7a9f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 23 Aug 2022 01:06:15 -0700 Subject: [PATCH 0028/1423] apparmor: make transition table unpack generic so it can be reused Currently the transition table is tied to the file dfa. Make it so we can unpack a transition table against any dfa. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 04e9fca250df0..052e3b914c180 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -466,13 +466,14 @@ static struct aa_dfa *unpack_dfa(struct aa_ext *e) /** * unpack_trans_table - unpack a profile transition table * @e: serialized data extent information (NOT NULL) - * @profile: profile to add the accept table to (NOT NULL) + * @table: str table to unpack to (NOT NULL) * - * Returns: true if table successfully unpacked + * Returns: true if table successfully unpacked or not present */ -static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) +static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) { void *saved_pos = e->pos; + char **table; /* exec table is optional */ if (unpack_nameX(e, AA_STRUCT, "xtable")) { @@ -482,12 +483,10 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) /* currently 2^24 bits entries 0-3 */ if (size > (1 << 24)) goto fail; - profile->file.trans.table = kcalloc(size, sizeof(char *), - GFP_KERNEL); - if (!profile->file.trans.table) + table = kcalloc(size, sizeof(char *), GFP_KERNEL); + if (!table) goto fail; - profile->file.trans.size = size; for (i = 0; i < size; i++) { char *str; int c, j, pos, size2 = unpack_strdup(e, &str, NULL); @@ -496,7 +495,7 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) */ if (!size2) goto fail; - profile->file.trans.table[i] = str; + table[i] = str; /* verify that name doesn't start with space */ if (isspace(*str)) goto fail; @@ -530,11 +529,14 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) goto fail; if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; + + strs->table = table; + strs->size = size; } return true; fail: - aa_free_str_table(&profile->file.trans); + kfree_sensitive(table); e->pos = saved_pos; return false; } @@ -880,7 +882,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "failed to remap file permission table"; goto fail; } - if (!unpack_trans_table(e, profile)) { + if (!unpack_trans_table(e, &profile->file.trans)) { info = "failed to unpack profile transition table"; goto fail; } -- GitLab From ad596ea74e746d60bb7e13f3adde097a08b2089b Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 18 Jul 2022 16:53:17 -0700 Subject: [PATCH 0029/1423] apparmor: group dfa policydb unpacking There are currently three policydb rule groupings (xmatch, file, policydb) that each do their own slightly different thing. Group them into a single routine and unify. This extends/unifies dfa features by - all dfas are allowed having an optional start field - all dfas are allowed having a string/transition table Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 101 +++++++++++++++++++----------- 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 052e3b914c180..a1fe0a5e8e57d 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -648,6 +648,54 @@ fail: return false; } +static int unpack_pdb(struct aa_ext *e, struct aa_policydb *policy, + bool required_dfa, bool required_trans, + const char **info) +{ + int i; + + policy->dfa = unpack_dfa(e); + if (IS_ERR(policy->dfa)) { + int error = PTR_ERR(policy->dfa); + + policy->dfa = NULL; + *info = "failed to unpack - dfa"; + return error; + } else if (!policy->dfa) { + if (required_dfa) { + *info = "missing required dfa"; + return -EPROTO; + } + goto out; + } + + /* + * only unpack the following if a dfa is present + * + * sadly start was given different names for file and policydb + * but since it is optional we can try both + */ + if (!unpack_u32(e, &policy->start[0], "start")) + /* default start state */ + policy->start[0] = DFA_START; + if (!unpack_u32(e, &policy->start[AA_CLASS_FILE], "dfa_start")) { + /* default start state for xmatch and file dfa */ + policy->start[AA_CLASS_FILE] = DFA_START; + } /* setup class index */ + for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { + policy->start[i] = aa_dfa_next(policy->dfa, policy->start[0], + i); + } + if (!unpack_trans_table(e, &policy->trans) && required_trans) { + *info = "failed to unpack profile transition table"; + return -EPROTO; + } + /* TODO: move compat mapping here, requires dfa merging first */ + +out: + return 0; +} + static u32 strhash(const void *data, u32 len, u32 seed) { const char * const *key = data; @@ -679,7 +727,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) struct rhashtable_params params = { 0 }; char *key = NULL; struct aa_data *data; - int i, error = -EPROTO; + int error = -EPROTO; kernel_cap_t tmpcap; u32 tmp; @@ -714,13 +762,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) (void) unpack_str(e, &profile->attach, "attach"); /* xmatch is optional and may be NULL */ - profile->xmatch.dfa = unpack_dfa(e); - if (IS_ERR(profile->xmatch.dfa)) { - error = PTR_ERR(profile->xmatch.dfa); - profile->xmatch.dfa = NULL; - info = "bad xmatch"; + error = unpack_pdb(e, &profile->xmatch, false, false, &info); + if (error) goto fail; - } + /* neither xmatch_len not xmatch_perms are optional if xmatch is set */ if (profile->xmatch.dfa) { if (!unpack_u32(e, &tmp, NULL)) { @@ -838,25 +883,16 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; - profile->policy.dfa = unpack_dfa(e); - if (IS_ERR(profile->policy.dfa)) { - error = PTR_ERR(profile->policy.dfa); - profile->policy.dfa = NULL; - goto fail; - } else if (!profile->policy.dfa) { - error = -EPROTO; + error = unpack_pdb(e, &profile->policy, true, false, &info); + if (error) goto fail; - } - if (!unpack_u32(e, &profile->policy.start[0], "start")) - /* default start state */ - profile->policy.start[0] = DFA_START; - /* setup class index */ - for (i = AA_CLASS_FILE; i <= AA_CLASS_LAST; i++) { - profile->policy.start[i] = - aa_dfa_next(profile->policy.dfa, - profile->policy.start[0], - i); - } + /* Fixup: drop when we get rid of start array */ + if (aa_dfa_next(profile->policy.dfa, profile->policy.start[0], + AA_CLASS_FILE)) + profile->policy.start[AA_CLASS_FILE] = + aa_dfa_next(profile->policy.dfa, + profile->policy.start[0], + AA_CLASS_FILE); if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; if (aa_compat_map_policy(&profile->policy, e->version)) { @@ -867,25 +903,14 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile->policy.dfa = aa_get_dfa(nulldfa); /* get file rules */ - profile->file.dfa = unpack_dfa(e); - if (IS_ERR(profile->file.dfa)) { - error = PTR_ERR(profile->file.dfa); - profile->file.dfa = NULL; - info = "failed to unpack profile file rules"; + error = unpack_pdb(e, &profile->file, false, true, &info); + if (error) { goto fail; } else if (profile->file.dfa) { - if (!unpack_u32(e, &profile->file.start[AA_CLASS_FILE], - "dfa_start")) - /* default start state */ - profile->file.start[AA_CLASS_FILE] = DFA_START; if (aa_compat_map_file(&profile->file)) { info = "failed to remap file permission table"; goto fail; } - if (!unpack_trans_table(e, &profile->file.trans)) { - info = "failed to unpack profile transition table"; - goto fail; - } } else if (profile->policy.dfa && profile->policy.start[AA_CLASS_FILE]) { profile->file.dfa = aa_get_dfa(profile->policy.dfa); -- GitLab From 371e50a0b19f9765bfb9e4f172e72f4e9a4625bc Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 26 Aug 2022 09:26:57 -0700 Subject: [PATCH 0030/1423] apparmor: make unpack_array return a trianary value currently unpack_array() does not return an error nor whether the array is not present. The ability to detect an error or the array not being present is needed so rework the unpack_array() to return the needed information. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 43 ++++++++++++++++---------- security/apparmor/policy_unpack_test.c | 12 +++---- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index a1fe0a5e8e57d..7d3b3e664c1c1 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -67,6 +67,11 @@ struct aa_ext { u32 version; }; +#define tri int +#define TRI_TRUE 1 +#define TRI_NONE 0 +#define TRI_FALSE -1 + /* audit callback for unpack fields */ static void audit_cb(struct audit_buffer *ab, void *va) { @@ -344,22 +349,22 @@ fail: return false; } -static size_t unpack_array(struct aa_ext *e, const char *name) +static tri unpack_array(struct aa_ext *e, const char *name, u16 *size) { void *pos = e->pos; if (unpack_nameX(e, AA_ARRAY, name)) { - int size; if (!inbounds(e, sizeof(u16))) goto fail; - size = (int)le16_to_cpu(get_unaligned((__le16 *) e->pos)); + *size = le16_to_cpu(get_unaligned((__le16 *) e->pos)); e->pos += sizeof(u16); - return size; + return TRI_TRUE; } + return TRI_NONE; fail: e->pos = pos; - return 0; + return TRI_FALSE; } static size_t unpack_blob(struct aa_ext *e, char **blob, const char *name) @@ -477,11 +482,12 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) /* exec table is optional */ if (unpack_nameX(e, AA_STRUCT, "xtable")) { - int i, size; + u16 size; + int i; - size = unpack_array(e, NULL); - /* currently 2^24 bits entries 0-3 */ - if (size > (1 << 24)) + if (unpack_array(e, NULL, &size) != TRI_TRUE || + size > (1 << 24)) + /* currently 2^24 bits entries 0-3 */ goto fail; table = kcalloc(size, sizeof(char *), GFP_KERNEL); if (!table) @@ -546,9 +552,11 @@ static bool unpack_xattrs(struct aa_ext *e, struct aa_profile *profile) void *pos = e->pos; if (unpack_nameX(e, AA_STRUCT, "xattrs")) { - int i, size; + u16 size; + int i; - size = unpack_array(e, NULL); + if (unpack_array(e, NULL, &size) != TRI_TRUE) + goto fail; profile->xattr_count = size; profile->xattrs = kcalloc(size, sizeof(char *), GFP_KERNEL); if (!profile->xattrs) @@ -573,10 +581,12 @@ fail: static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) { void *pos = e->pos; - int i, size; + u16 size; + int i; if (unpack_nameX(e, AA_STRUCT, "secmark")) { - size = unpack_array(e, NULL); + if (unpack_array(e, NULL, &size) != TRI_TRUE) + goto fail; profile->secmark = kcalloc(size, sizeof(struct aa_secmark), GFP_KERNEL); @@ -620,14 +630,15 @@ static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) /* rlimits are optional */ if (unpack_nameX(e, AA_STRUCT, "rlimits")) { - int i, size; + u16 size; + int i; u32 tmp = 0; if (!unpack_u32(e, &tmp, NULL)) goto fail; profile->rlimits.mask = tmp; - size = unpack_array(e, NULL); - if (size > RLIM_NLIMITS) + if (unpack_array(e, NULL, &size) != TRI_TRUE || + size > RLIM_NLIMITS) goto fail; for (i = 0; i < size; i++) { u64 tmp2 = 0; diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 0a969b2e03dba..1a43d538c4c04 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -144,8 +144,8 @@ static void policy_unpack_test_unpack_array_with_null_name(struct kunit *test) puf->e->pos += TEST_ARRAY_BUF_OFFSET; - array_size = unpack_array(puf->e, NULL); - + KUNIT_EXPECT_EQ(test, unpack_array(puf->e, NULL, &array_size), + TRI_TRUE); KUNIT_EXPECT_EQ(test, array_size, (u16)TEST_ARRAY_SIZE); KUNIT_EXPECT_PTR_EQ(test, puf->e->pos, puf->e->start + TEST_ARRAY_BUF_OFFSET + sizeof(u16) + 1); @@ -159,8 +159,8 @@ static void policy_unpack_test_unpack_array_with_name(struct kunit *test) puf->e->pos += TEST_NAMED_ARRAY_BUF_OFFSET; - array_size = unpack_array(puf->e, name); - + KUNIT_EXPECT_EQ(test, unpack_array(puf->e, name, &array_size), + TRI_TRUE); KUNIT_EXPECT_EQ(test, array_size, (u16)TEST_ARRAY_SIZE); KUNIT_EXPECT_PTR_EQ(test, puf->e->pos, puf->e->start + TEST_ARRAY_BUF_OFFSET + sizeof(u16) + 1); @@ -175,8 +175,8 @@ static void policy_unpack_test_unpack_array_out_of_bounds(struct kunit *test) puf->e->pos += TEST_NAMED_ARRAY_BUF_OFFSET; puf->e->end = puf->e->start + TEST_ARRAY_BUF_OFFSET + sizeof(u16); - array_size = unpack_array(puf->e, name); - + KUNIT_EXPECT_EQ(test, unpack_array(puf->e, name, &array_size), + TRI_TRUE); KUNIT_EXPECT_EQ(test, array_size, 0); KUNIT_EXPECT_PTR_EQ(test, puf->e->pos, puf->e->start + TEST_NAMED_ARRAY_BUF_OFFSET); -- GitLab From fd1b2b95a21177eaa9e26989637e477be4d93b2f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 26 Aug 2022 08:53:42 -0700 Subject: [PATCH 0031/1423] apparmor: add the ability for policy to specify a permission table Currently permissions are encoded in the dfa accept entries that are then mapped to an internal permission structure. This limits the permissions that userspace can specify, so allow userspace to directly specify the permission table. Signed-off-by: John Johansen --- security/apparmor/include/policy.h | 5 +- security/apparmor/policy_unpack.c | 104 ++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 9fc5d7fa36e86..2c39bd389f878 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -81,7 +81,10 @@ enum profile_mode { */ struct aa_policydb { struct aa_dfa *dfa; - struct aa_perms *perms; + struct { + struct aa_perms *perms; + u32 size; + }; struct aa_str_table trans; aa_state_t start[AA_CLASS_LAST + 1]; }; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 7d3b3e664c1c1..b85dbdde89390 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -435,10 +435,11 @@ static int unpack_strdup(struct aa_ext *e, char **string, const char *name) /** * unpack_dfa - unpack a file rule dfa * @e: serialized data extent information (NOT NULL) + * @flags: dfa flags to check * * returns dfa or ERR_PTR or NULL if no dfa */ -static struct aa_dfa *unpack_dfa(struct aa_ext *e) +static struct aa_dfa *unpack_dfa(struct aa_ext *e, int flags) { char *blob = NULL; size_t size; @@ -454,8 +455,6 @@ static struct aa_dfa *unpack_dfa(struct aa_ext *e) size_t sz = blob - (char *) e->start - ((e->pos - e->start) & 7); size_t pad = ALIGN(sz, 8) - sz; - int flags = TO_ACCEPT1_FLAG(YYTD_DATA32) | - TO_ACCEPT2_FLAG(YYTD_DATA32); if (aa_g_paranoid_load) flags |= DFA_FLAG_VERIFY_STATES; dfa = aa_dfa_unpack(blob + pad, size - pad, flags); @@ -659,23 +658,104 @@ fail: return false; } +static bool unpack_perm(struct aa_ext *e, u32 version, struct aa_perms *perm) +{ + bool res; + + if (version != 1) + return false; + + res = unpack_u32(e, &perm->allow, NULL); + res = res && unpack_u32(e, &perm->allow, NULL); + res = res && unpack_u32(e, &perm->deny, NULL); + res = res && unpack_u32(e, &perm->subtree, NULL); + res = res && unpack_u32(e, &perm->cond, NULL); + res = res && unpack_u32(e, &perm->kill, NULL); + res = res && unpack_u32(e, &perm->complain, NULL); + res = res && unpack_u32(e, &perm->prompt, NULL); + res = res && unpack_u32(e, &perm->audit, NULL); + res = res && unpack_u32(e, &perm->quiet, NULL); + res = res && unpack_u32(e, &perm->hide, NULL); + res = res && unpack_u32(e, &perm->xindex, NULL); + res = res && unpack_u32(e, &perm->tag, NULL); + res = res && unpack_u32(e, &perm->label, NULL); + + return res; +} + +static ssize_t unpack_perms_table(struct aa_ext *e, struct aa_perms **perms) +{ + void *pos = e->pos; + u16 size = 0; + + AA_BUG(!perms); + /* + * policy perms are optional, in which case perms are embedded + * in the dfa accept table + */ + if (unpack_nameX(e, AA_STRUCT, "perms")) { + int i; + u32 version; + + if (!unpack_u32(e, &version, "version")) + goto fail_reset; + if (unpack_array(e, NULL, &size) != TRI_TRUE) + goto fail_reset; + *perms = kcalloc(size, sizeof(struct aa_perms), GFP_KERNEL); + if (!*perms) + goto fail_reset; + for (i = 0; i < size; i++) { + if (!unpack_perm(e, version, &(*perms)[i])) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + goto fail; + } else + *perms = NULL; + + return size; + +fail: + kfree(*perms); +fail_reset: + e->pos = pos; + return -EPROTO; +} + static int unpack_pdb(struct aa_ext *e, struct aa_policydb *policy, bool required_dfa, bool required_trans, const char **info) { - int i; + void *pos = e->pos; + int i, flags, error = -EPROTO; - policy->dfa = unpack_dfa(e); - if (IS_ERR(policy->dfa)) { - int error = PTR_ERR(policy->dfa); + policy->size = unpack_perms_table(e, &policy->perms); + if (policy->size < 0) { + error = policy->size; + policy->perms = NULL; + *info = "failed to unpack - perms"; + goto fail; + } else if (policy->perms) { + /* perms table present accept is index */ + flags = TO_ACCEPT1_FLAG(YYTD_DATA32); + } else { + /* packed perms in accept1 and accept2 */ + flags = TO_ACCEPT1_FLAG(YYTD_DATA32) | + TO_ACCEPT2_FLAG(YYTD_DATA32); + } + policy->dfa = unpack_dfa(e, flags); + if (IS_ERR(policy->dfa)) { + error = PTR_ERR(policy->dfa); policy->dfa = NULL; *info = "failed to unpack - dfa"; - return error; + goto fail; } else if (!policy->dfa) { if (required_dfa) { *info = "missing required dfa"; - return -EPROTO; + goto fail; } goto out; } @@ -699,12 +779,16 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb *policy, } if (!unpack_trans_table(e, &policy->trans) && required_trans) { *info = "failed to unpack profile transition table"; - return -EPROTO; + goto fail; } /* TODO: move compat mapping here, requires dfa merging first */ out: return 0; + +fail: + e->pos = pos; + return error; } static u32 strhash(const void *data, u32 len, u32 seed) -- GitLab From 670f31774ab6bf8e2d756f27444b035b9be8a0c9 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 26 Aug 2022 13:32:34 -0700 Subject: [PATCH 0032/1423] apparmor: verify permission table indexes While the dfa xindex's are verified, the indexes in the permission table are not currently verified. Fix this. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 35 ++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index b85dbdde89390..312bd632a472b 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -781,8 +781,9 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb *policy, *info = "failed to unpack profile transition table"; goto fail; } - /* TODO: move compat mapping here, requires dfa merging first */ + /* TODO: move compat mapping here, requires dfa merging first */ + /* TODO: move verify here, it has to be done after compat mappings */ out: return 0; @@ -1149,6 +1150,22 @@ static bool verify_dfa_xindex(struct aa_dfa *dfa, int table_size) return true; } +static bool verify_perm_indexes(struct aa_policydb *pdb) +{ + int i; + + for (i = 0; i < pdb->size; i++) { + if (pdb->perms[i].xindex >= pdb->trans.size) + return false; + if (pdb->perms[i].tag >= pdb->trans.size) + return false; + if (pdb->perms[i].label >= pdb->trans.size) + return false; + } + + return true; +} + /** * verify_profile - Do post unpack analysis to verify profile consistency * @profile: profile to verify (NOT NULL) @@ -1170,6 +1187,22 @@ static int verify_profile(struct aa_profile *profile) return -EPROTO; } + if (!verify_perm_indexes(&profile->file)) { + audit_iface(profile, NULL, NULL, + "Unpack: Invalid perm index", NULL, -EPROTO); + return -EPROTO; + } + if (!verify_perm_indexes(&profile->policy)) { + audit_iface(profile, NULL, NULL, + "Unpack: Invalid perm index", NULL, -EPROTO); + return -EPROTO; + } + if (!verify_perm_indexes(&profile->xmatch)) { + audit_iface(profile, NULL, NULL, + "Unpack: Invalid perm index", NULL, -EPROTO); + return -EPROTO; + } + return 0; } -- GitLab From 0bece4fa97a2bd397da66d4fced78f76eb214a3e Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 5 Sep 2022 23:53:29 -0700 Subject: [PATCH 0033/1423] apparmor: make sure perm indexes are accumulated accumulate permission indexes on a first encountered basis. This favors original rulesets so that new ones can not override without profile replacement. Signed-off-by: John Johansen --- security/apparmor/include/file.h | 4 ++-- security/apparmor/include/perms.h | 9 +++++++++ security/apparmor/lib.c | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 521c8568f6d49..1a1c0f0c50710 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -88,10 +88,10 @@ static inline struct aa_label *aa_get_file_label(struct aa_file_ctx *ctx) * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ -#define AA_X_INDEX_MASK 0x00ffffff +#define AA_X_INDEX_MASK AA_INDEX_MASK #define AA_X_TYPE_MASK 0x0c000000 -#define AA_X_NONE 0x00000000 +#define AA_X_NONE AA_INDEX_NONE #define AA_X_NAME 0x04000000 /* use executable name px */ #define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index d66059fcebb45..0de8c3fb090dc 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -78,11 +78,20 @@ struct aa_perms { u32 quiet; /* set only when ~allow | deny */ u32 hide; /* set only when ~allow | deny */ + u32 xindex; u32 tag; /* tag string index, if present */ u32 label; /* label string index, if present */ }; +/* + * Indexes are broken into a 24 bit index and 8 bit flag. + * For the index to be valid there must be a value in the flag + */ +#define AA_INDEX_MASK 0x00ffffff +#define AA_INDEX_FLAG_MASK 0xff000000 +#define AA_INDEX_NONE 0 + #define ALL_PERMS_MASK 0xffffffff extern struct aa_perms nullperms; extern struct aa_perms allperms; diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index b0fcec8932745..d6a8c361025b4 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -348,6 +348,13 @@ void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend) accum->hide &= addend->hide & ~addend->allow; accum->prompt |= addend->prompt & ~addend->allow & ~addend->deny; accum->subtree |= addend->subtree & ~addend->deny; + + if (!accum->xindex) + accum->xindex = addend->xindex; + if (!accum->tag) + accum->tag = addend->tag; + if (!accum->label) + accum->label = addend->label; } /** @@ -367,6 +374,13 @@ void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend) accum->hide &= addend->hide & ~accum->allow; accum->prompt |= addend->prompt & ~accum->allow & ~accum->deny; accum->subtree &= addend->subtree & ~accum->deny; + + if (!accum->xindex) + accum->xindex = addend->xindex; + if (!accum->tag) + accum->tag = addend->tag; + if (!accum->label) + accum->label = addend->label; } void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, -- GitLab From 3dfd16ab697ff23973b6fbb89808372bcd008dd1 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 5 Sep 2022 23:57:51 -0700 Subject: [PATCH 0034/1423] apparmor: cleanup: move perm accumulation into perms.h Perm accumulation is going to be used much more frequently so let the compiler figure out if it can be optimized when used. Signed-off-by: John Johansen --- security/apparmor/include/perms.h | 53 +++++++++++++++++++++++++++++++ security/apparmor/lib.c | 52 ------------------------------ 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 0de8c3fb090dc..9fa71957ac3a0 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -96,6 +96,59 @@ struct aa_perms { extern struct aa_perms nullperms; extern struct aa_perms allperms; +/** + * aa_perms_accum_raw - accumulate perms with out masking off overlapping perms + * @accum - perms struct to accumulate into + * @addend - perms struct to add to @accum + */ +static inline void aa_perms_accum_raw(struct aa_perms *accum, + struct aa_perms *addend) +{ + accum->deny |= addend->deny; + accum->allow &= addend->allow & ~addend->deny; + accum->audit |= addend->audit & addend->allow; + accum->quiet &= addend->quiet & ~addend->allow; + accum->kill |= addend->kill & ~addend->allow; + accum->complain |= addend->complain & ~addend->allow & ~addend->deny; + accum->cond |= addend->cond & ~addend->allow & ~addend->deny; + accum->hide &= addend->hide & ~addend->allow; + accum->prompt |= addend->prompt & ~addend->allow & ~addend->deny; + accum->subtree |= addend->subtree & ~addend->deny; + + if (!accum->xindex) + accum->xindex = addend->xindex; + if (!accum->tag) + accum->tag = addend->tag; + if (!accum->label) + accum->label = addend->label; +} + +/** + * aa_perms_accum - accumulate perms, masking off overlapping perms + * @accum - perms struct to accumulate into + * @addend - perms struct to add to @accum + */ +static inline void aa_perms_accum(struct aa_perms *accum, + struct aa_perms *addend) +{ + accum->deny |= addend->deny; + accum->allow &= addend->allow & ~accum->deny; + accum->audit |= addend->audit & accum->allow; + accum->quiet &= addend->quiet & ~accum->allow; + accum->kill |= addend->kill & ~accum->allow; + accum->complain |= addend->complain & ~accum->allow & ~accum->deny; + accum->cond |= addend->cond & ~accum->allow & ~accum->deny; + accum->hide &= addend->hide & ~accum->allow; + accum->prompt |= addend->prompt & ~accum->allow & ~accum->deny; + accum->subtree &= addend->subtree & ~accum->deny; + + if (!accum->xindex) + accum->xindex = addend->xindex; + if (!accum->tag) + accum->tag = addend->tag; + if (!accum->label) + accum->label = addend->label; +} #define xcheck(FN1, FN2) \ ({ \ diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index d6a8c361025b4..10e3b11e02adf 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -331,58 +331,6 @@ void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms) perms->prompt = ALL_PERMS_MASK; } -/** - * aa_perms_accum_raw - accumulate perms with out masking off overlapping perms - * @accum - perms struct to accumulate into - * @addend - perms struct to add to @accum - */ -void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend) -{ - accum->deny |= addend->deny; - accum->allow &= addend->allow & ~addend->deny; - accum->audit |= addend->audit & addend->allow; - accum->quiet &= addend->quiet & ~addend->allow; - accum->kill |= addend->kill & ~addend->allow; - accum->complain |= addend->complain & ~addend->allow & ~addend->deny; - accum->cond |= addend->cond & ~addend->allow & ~addend->deny; - accum->hide &= addend->hide & ~addend->allow; - accum->prompt |= addend->prompt & ~addend->allow & ~addend->deny; - accum->subtree |= addend->subtree & ~addend->deny; - - if (!accum->xindex) - accum->xindex = addend->xindex; - if (!accum->tag) - accum->tag = addend->tag; - if (!accum->label) - accum->label = addend->label; -} - -/** - * aa_perms_accum - accumulate perms, masking off overlapping perms - * @accum - perms struct to accumulate into - * @addend - perms struct to add to @accum - */ -void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend) -{ - accum->deny |= addend->deny; - accum->allow &= addend->allow & ~accum->deny; - accum->audit |= addend->audit & accum->allow; - accum->quiet &= addend->quiet & ~accum->allow; - accum->kill |= addend->kill & ~accum->allow; - accum->complain |= addend->complain & ~accum->allow & ~accum->deny; - accum->cond |= addend->cond & ~accum->allow & ~accum->deny; - accum->hide &= addend->hide & ~accum->allow; - accum->prompt |= addend->prompt & ~accum->allow & ~accum->deny; - accum->subtree &= addend->subtree & ~accum->deny; - - if (!accum->xindex) - accum->xindex = addend->xindex; - if (!accum->tag) - accum->tag = addend->tag; - if (!accum->label) - accum->label = addend->label; -} - void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, int type, u32 request, struct aa_perms *perms) { -- GitLab From 3bf3d728a58d7dcf2bbf179e3263fb8651f6097b Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 6 Sep 2022 00:38:20 -0700 Subject: [PATCH 0035/1423] apparmor: verify loaded permission bits masks don't overlap Add an additional verification that loaded permission sets don't overlap in ways that are not intended. This will help ensure that permission accumulation can't result in an invalid permission set. Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 312bd632a472b..5a78aaa0eea49 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1150,11 +1150,37 @@ static bool verify_dfa_xindex(struct aa_dfa *dfa, int table_size) return true; } -static bool verify_perm_indexes(struct aa_policydb *pdb) +static bool verify_perm(struct aa_perms *perm) +{ + /* TODO: allow option to just force the perms into a valid state */ + if (perm->allow & perm->deny) + return false; + if (perm->subtree & ~perm->allow) + return false; + if (perm->cond & (perm->allow | perm->deny)) + return false; + if (perm->kill & perm->allow) + return false; + if (perm->complain & (perm->allow | perm->deny)) + return false; + if (perm->prompt & (perm->allow | perm->deny)) + return false; + if (perm->complain & perm->prompt) + return false; + if (perm->hide & perm->allow) + return false; + + return true; +} + +static bool verify_perms(struct aa_policydb *pdb) { int i; for (i = 0; i < pdb->size; i++) { + if (!verify_perm(&pdb->perms[i])) + return false; + /* verify indexes into str table */ if (pdb->perms[i].xindex >= pdb->trans.size) return false; if (pdb->perms[i].tag >= pdb->trans.size) @@ -1187,17 +1213,17 @@ static int verify_profile(struct aa_profile *profile) return -EPROTO; } - if (!verify_perm_indexes(&profile->file)) { + if (!verify_perms(&profile->file)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; } - if (!verify_perm_indexes(&profile->policy)) { + if (!verify_perms(&profile->policy)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; } - if (!verify_perm_indexes(&profile->xmatch)) { + if (!verify_perms(&profile->xmatch)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; -- GitLab From 217af7e2f4deb629aaa49622685ccfee923898ca Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 29 Jul 2022 17:17:31 -0700 Subject: [PATCH 0036/1423] apparmor: refactor profile rules and attachments In preparation for moving from a single set of rules and a single attachment to multiple rulesets and attachments separate from the profile refactor attachment information and ruleset info into their own structures. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 27 ++++--- security/apparmor/capability.c | 12 +-- security/apparmor/domain.c | 81 +++++++++++--------- security/apparmor/file.c | 14 ++-- security/apparmor/include/label.h | 9 ++- security/apparmor/include/perms.h | 3 +- security/apparmor/include/policy.h | 84 ++++++++++++-------- security/apparmor/ipc.c | 9 ++- security/apparmor/label.c | 45 ++++++----- security/apparmor/lib.c | 13 ++-- security/apparmor/lsm.c | 4 +- security/apparmor/mount.c | 31 ++++---- security/apparmor/net.c | 24 +++--- security/apparmor/policy.c | 44 +++++++---- security/apparmor/policy_ns.c | 4 +- security/apparmor/policy_unpack.c | 118 +++++++++++++++-------------- security/apparmor/resource.c | 15 ++-- security/apparmor/task.c | 10 +-- 18 files changed, 308 insertions(+), 239 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index fb9d2ccb34d60..84ef8b400b401 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -611,30 +611,29 @@ static const struct file_operations aa_fs_ns_revision_fops = { static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, const char *match_str, size_t match_len) { + struct aa_ruleset *rules = &profile->rules; struct aa_perms tmp = { }; - struct aa_dfa *dfa; aa_state_t state = DFA_NOMATCH; if (profile_unconfined(profile)) return; - if (profile->file.dfa && *match_str == AA_CLASS_FILE) { - dfa = profile->file.dfa; - state = aa_dfa_match_len(dfa, - profile->file.start[AA_CLASS_FILE], + if (rules->file.dfa && *match_str == AA_CLASS_FILE) { + state = aa_dfa_match_len(rules->file.dfa, + rules->file.start[AA_CLASS_FILE], match_str + 1, match_len - 1); if (state) { struct path_cond cond = { }; - tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); + tmp = *(aa_lookup_fperms(&(rules->file), state, &cond)); } - } else if (profile->policy.dfa) { - if (!PROFILE_MEDIATES(profile, *match_str)) + } else if (rules->policy.dfa) { + if (!RULE_MEDIATES(rules, *match_str)) return; /* no change to current perms */ - dfa = profile->policy.dfa; - state = aa_dfa_match_len(dfa, profile->policy.start[0], + state = aa_dfa_match_len(rules->policy.dfa, + rules->policy.start[0], match_str, match_len); if (state) - tmp = *aa_lookup_perms(&profile->policy, state); + tmp = *aa_lookup_perms(&rules->policy, state); } aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum_raw(perms, &tmp); @@ -1093,9 +1092,9 @@ static int seq_profile_attach_show(struct seq_file *seq, void *v) struct aa_proxy *proxy = seq->private; struct aa_label *label = aa_get_label_rcu(&proxy->label); struct aa_profile *profile = labels_profile(label); - if (profile->attach) - seq_printf(seq, "%s\n", profile->attach); - else if (profile->xmatch.dfa) + if (profile->attach.xmatch_str) + seq_printf(seq, "%s\n", profile->attach.xmatch_str); + else if (profile->attach.xmatch.dfa) seq_puts(seq, "\n"); else seq_printf(seq, "%s\n", profile->base.name); diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index 6cabd6109f12a..b66ec63e2a489 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -64,6 +64,7 @@ static void audit_cb(struct audit_buffer *ab, void *va) static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, int cap, int error) { + struct aa_ruleset *rules = &profile->rules; struct audit_cache *ent; int type = AUDIT_APPARMOR_AUTO; @@ -72,13 +73,13 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, if (likely(!error)) { /* test if auditing is being forced */ if (likely((AUDIT_MODE(profile) != AUDIT_ALL) && - !cap_raised(profile->caps.audit, cap))) + !cap_raised(rules->caps.audit, cap))) return 0; type = AUDIT_APPARMOR_AUDIT; } else if (KILL_MODE(profile) || - cap_raised(profile->caps.kill, cap)) { + cap_raised(rules->caps.kill, cap)) { type = AUDIT_APPARMOR_KILL; - } else if (cap_raised(profile->caps.quiet, cap) && + } else if (cap_raised(rules->caps.quiet, cap) && AUDIT_MODE(profile) != AUDIT_NOQUIET && AUDIT_MODE(profile) != AUDIT_ALL) { /* quiet auditing */ @@ -114,10 +115,11 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, static int profile_capable(struct aa_profile *profile, int cap, unsigned int opts, struct common_audit_data *sa) { + struct aa_ruleset *rules = &profile->rules; int error; - if (cap_raised(profile->caps.allow, cap) && - !cap_raised(profile->caps.denied, cap)) + if (cap_raised(rules->caps.allow, cap) && + !cap_raised(rules->caps.denied, cap)) error = 0; else error = -EPERM; diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 4cb046cf3a14f..ad035d14cfc57 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -81,19 +81,20 @@ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_profile *tp, bool stack, aa_state_t state) { + struct aa_ruleset *rules = &profile->rules; const char *ns_name; if (stack) - state = aa_dfa_match(profile->file.dfa, state, "&"); + state = aa_dfa_match(rules->file.dfa, state, "&"); if (profile->ns == tp->ns) - return aa_dfa_match(profile->file.dfa, state, tp->base.hname); + return aa_dfa_match(rules->file.dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); - state = aa_dfa_match_len(profile->file.dfa, state, ":", 1); - state = aa_dfa_match(profile->file.dfa, state, ns_name); - state = aa_dfa_match_len(profile->file.dfa, state, ":", 1); - return aa_dfa_match(profile->file.dfa, state, tp->base.hname); + state = aa_dfa_match_len(rules->file.dfa, state, ":", 1); + state = aa_dfa_match(rules->file.dfa, state, ns_name); + state = aa_dfa_match_len(rules->file.dfa, state, ":", 1); + return aa_dfa_match(rules->file.dfa, state, tp->base.hname); } /** @@ -117,6 +118,7 @@ static int label_compound_match(struct aa_profile *profile, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { + struct aa_ruleset *rules = &profile->rules; struct aa_profile *tp; struct label_it i; struct path_cond cond = { }; @@ -139,12 +141,12 @@ next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; - state = aa_dfa_match(profile->file.dfa, state, "//&"); + state = aa_dfa_match(rules->file.dfa, state, "//&"); state = match_component(profile, tp, false, state); if (!state) goto fail; } - *perms = *(aa_lookup_fperms(&(profile->file), state, &cond)); + *perms = *(aa_lookup_fperms(&(rules->file), state, &cond)); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -177,6 +179,7 @@ static int label_components_match(struct aa_profile *profile, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { + struct aa_ruleset *rules = &profile->rules; struct aa_profile *tp; struct label_it i; struct aa_perms tmp; @@ -197,7 +200,7 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); + tmp = *(aa_lookup_fperms(&(rules->file), state, &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { @@ -206,7 +209,7 @@ next: state = match_component(profile, tp, stack, start); if (!state) goto fail; - tmp = *(aa_lookup_fperms(&(profile->file), state, &cond)); + tmp = *(aa_lookup_fperms(&(rules->file), state, &cond)); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } @@ -296,18 +299,19 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, ssize_t size; struct dentry *d; char *value = NULL; - int value_size = 0, ret = profile->xattr_count; + struct aa_attachment *attach = &profile->attach; + int value_size = 0, ret = attach->xattr_count; - if (!bprm || !profile->xattr_count) + if (!bprm || !attach->xattr_count) return 0; might_sleep(); /* transition from exec match to xattr set */ - state = aa_dfa_outofband_transition(profile->xmatch.dfa, state); + state = aa_dfa_outofband_transition(attach->xmatch.dfa, state); d = bprm->file->f_path.dentry; - for (i = 0; i < profile->xattr_count; i++) { - size = vfs_getxattr_alloc(&init_user_ns, d, profile->xattrs[i], + for (i = 0; i < attach->xattr_count; i++) { + size = vfs_getxattr_alloc(&init_user_ns, d, attach->xattrs[i], &value, value_size, GFP_KERNEL); if (size >= 0) { u32 index, perm; @@ -317,20 +321,20 @@ static int aa_xattrs_match(const struct linux_binprm *bprm, * that not present xattr can be distinguished from a 0 * length value or rule that matches any value */ - state = aa_dfa_null_transition(profile->xmatch.dfa, + state = aa_dfa_null_transition(attach->xmatch.dfa, state); /* Check xattr value */ - state = aa_dfa_match_len(profile->xmatch.dfa, state, + state = aa_dfa_match_len(attach->xmatch.dfa, state, value, size); - index = ACCEPT_TABLE(profile->xmatch.dfa)[state]; - perm = profile->xmatch.perms[index].allow; + index = ACCEPT_TABLE(attach->xmatch.dfa)[state]; + perm = attach->xmatch.perms[index].allow; if (!(perm & MAY_EXEC)) { ret = -EINVAL; goto out; } } /* transition to next element */ - state = aa_dfa_outofband_transition(profile->xmatch.dfa, state); + state = aa_dfa_outofband_transition(attach->xmatch.dfa, state); if (size < 0) { /* * No xattr match, so verify if transition to @@ -382,6 +386,8 @@ static struct aa_label *find_attach(const struct linux_binprm *bprm, rcu_read_lock(); restart: list_for_each_entry_rcu(profile, head, base.list) { + struct aa_attachment *attach = &profile->attach; + if (profile->label.flags & FLAG_NULL && &profile->label == ns_unconfined(profile->ns)) continue; @@ -397,16 +403,16 @@ restart: * as another profile, signal a conflict and refuse to * match. */ - if (profile->xmatch.dfa) { + if (attach->xmatch.dfa) { unsigned int count; aa_state_t state; u32 index, perm; - state = aa_dfa_leftmatch(profile->xmatch.dfa, - profile->xmatch.start[AA_CLASS_XMATCH], + state = aa_dfa_leftmatch(attach->xmatch.dfa, + attach->xmatch.start[AA_CLASS_XMATCH], name, &count); - index = ACCEPT_TABLE(profile->xmatch.dfa)[state]; - perm = profile->xmatch.perms[index].allow; + index = ACCEPT_TABLE(attach->xmatch.dfa)[state]; + perm = attach->xmatch.perms[index].allow; /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { int ret = 0; @@ -414,7 +420,7 @@ restart: if (count < candidate_len) continue; - if (bprm && profile->xattr_count) { + if (bprm && attach->xattr_count) { long rev = READ_ONCE(ns->revision); if (!aa_get_profile_not0(profile)) @@ -453,7 +459,7 @@ restart: * xattrs, or a longer match */ candidate = profile; - candidate_len = max(count, profile->xmatch_len); + candidate_len = max(count, attach->xmatch_len); candidate_xattrs = ret; conflict = false; } @@ -497,6 +503,7 @@ static const char *next_name(int xtype, const char *name) struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, const char **name) { + struct aa_ruleset *rules = &profile->rules; struct aa_label *label = NULL; u32 xtype = xindex & AA_X_TYPE_MASK; int index = xindex & AA_X_INDEX_MASK; @@ -507,7 +514,7 @@ struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, /* TODO: move lookup parsing to unpack time so this is a straight * index into the resultant label */ - for (*name = profile->file.trans.table[index]; !label && *name; + for (*name = rules->file.trans.table[index]; !label && *name; *name = next_name(xtype, *name)) { if (xindex & AA_X_CHILD) { struct aa_profile *new_profile; @@ -546,6 +553,7 @@ static struct aa_label *x_to_label(struct aa_profile *profile, const char **lookupname, const char **info) { + struct aa_ruleset *rules = &profile->rules; struct aa_label *new = NULL; struct aa_ns *ns = profile->ns; u32 xtype = xindex & AA_X_TYPE_MASK; @@ -558,7 +566,7 @@ static struct aa_label *x_to_label(struct aa_profile *profile, break; case AA_X_TABLE: /* TODO: fix when perm mapping done at unload */ - stack = profile->file.trans.table[xindex & AA_X_INDEX_MASK]; + stack = rules->file.trans.table[xindex & AA_X_INDEX_MASK]; if (*stack != '&') { /* released by caller */ new = x_table_lookup(profile, xindex, lookupname); @@ -612,9 +620,10 @@ static struct aa_label *profile_transition(struct aa_profile *profile, char *buffer, struct path_cond *cond, bool *secure_exec) { + struct aa_ruleset *rules = &profile->rules; struct aa_label *new = NULL; const char *info = NULL, *name = NULL, *target = NULL; - aa_state_t state = profile->file.start[AA_CLASS_FILE]; + aa_state_t state = rules->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; bool nonewprivs = false; int error = 0; @@ -648,7 +657,7 @@ static struct aa_label *profile_transition(struct aa_profile *profile, } /* find exec permissions for name */ - state = aa_str_perms(&(profile->file), state, name, cond, &perms); + state = aa_str_perms(&(rules->file), state, name, cond, &perms); if (perms.allow & MAY_EXEC) { /* exec permission determine how to transition */ new = x_to_label(profile, bprm, name, perms.xindex, &target, @@ -710,7 +719,8 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, char *buffer, struct path_cond *cond, bool *secure_exec) { - aa_state_t state = profile->file.start[AA_CLASS_FILE]; + struct aa_ruleset *rules = &profile->rules; + aa_state_t state = rules->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; const char *xname = NULL, *info = "change_profile onexec"; int error = -EACCES; @@ -743,7 +753,7 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, } /* find exec permissions for name */ - state = aa_str_perms(&(profile->file), state, xname, cond, &perms); + state = aa_str_perms(&(rules->file), state, xname, cond, &perms); if (!(perms.allow & AA_MAY_ONEXEC)) { info = "no change_onexec valid for executable"; goto audit; @@ -752,7 +762,7 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, * onexec permission is linked to exec with a standard pairing * exec\0change_profile */ - state = aa_dfa_null_transition(profile->file.dfa, state); + state = aa_dfa_null_transition(rules->file.dfa, state); error = change_profile_perms(profile, onexec, stack, AA_MAY_ONEXEC, state, &perms); if (error) { @@ -1249,12 +1259,13 @@ static int change_profile_perms_wrapper(const char *op, const char *name, struct aa_label *target, bool stack, u32 request, struct aa_perms *perms) { + struct aa_ruleset *rules = &profile->rules; const char *info = NULL; int error = 0; if (!error) error = change_profile_perms(profile, target, stack, request, - profile->file.start[AA_CLASS_FILE], + rules->file.start[AA_CLASS_FILE], perms); if (error) error = aa_audit_file(profile, perms, op, request, name, diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 69d936d04f948..ef5d98f81a2b8 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -224,11 +224,12 @@ int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { + struct aa_ruleset *rules = &profile->rules; int e = 0; if (profile_unconfined(profile)) return 0; - aa_str_perms(&(profile->file), profile->file.start[AA_CLASS_FILE], + aa_str_perms(&(rules->file), rules->file.start[AA_CLASS_FILE], name, cond, perms); if (request & ~perms->allow) e = -EACCES; @@ -316,6 +317,7 @@ static int profile_path_link(struct aa_profile *profile, const struct path *target, char *buffer2, struct path_cond *cond) { + struct aa_ruleset *rules = &profile->rules; const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; @@ -336,16 +338,16 @@ static int profile_path_link(struct aa_profile *profile, error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ - state = aa_str_perms(&(profile->file), - profile->file.start[AA_CLASS_FILE], lname, + state = aa_str_perms(&(rules->file), + rules->file.start[AA_CLASS_FILE], lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) goto audit; /* test to see if target can be paired with link */ - state = aa_dfa_null_transition(profile->file.dfa, state); - aa_str_perms(&(profile->file), state, tname, cond, &perms); + state = aa_dfa_null_transition(rules->file.dfa, state); + aa_str_perms(&(rules->file), state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. @@ -367,7 +369,7 @@ static int profile_path_link(struct aa_profile *profile, /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ - aa_str_perms(&(profile->file), profile->file.start[AA_CLASS_FILE], + aa_str_perms(&(rules->file), rules->file.start[AA_CLASS_FILE], tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ diff --git a/security/apparmor/include/label.h b/security/apparmor/include/label.h index 1130ba10a1526..2a72e6b17d68b 100644 --- a/security/apparmor/include/label.h +++ b/security/apparmor/include/label.h @@ -261,7 +261,7 @@ for ((I).i = (I).j = 0; \ struct label_it i; \ int ret = 0; \ label_for_each(i, (L), profile) { \ - if (PROFILE_MEDIATES(profile, (C))) { \ + if (RULE_MEDIATES(&profile->rules, (C))) { \ ret = 1; \ break; \ } \ @@ -357,9 +357,10 @@ static inline const char *aa_label_str_split(const char *str) struct aa_perms; -int aa_label_match(struct aa_profile *profile, struct aa_label *label, - aa_state_t state, bool subns, u32 request, - struct aa_perms *perms); +struct aa_ruleset; +int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, + struct aa_label *label, aa_state_t state, bool subns, + u32 request, struct aa_perms *perms); /** diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 9fa71957ac3a0..797a7a00644d2 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -207,7 +207,8 @@ void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms); void aa_perms_accum(struct aa_perms *accum, struct aa_perms *addend); void aa_perms_accum_raw(struct aa_perms *accum, struct aa_perms *addend); -void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, +void aa_profile_match_label(struct aa_profile *profile, + struct aa_ruleset *rules, struct aa_label *label, int type, u32 request, struct aa_perms *perms); int aa_profile_label_perm(struct aa_profile *profile, struct aa_profile *target, u32 request, int type, u32 *deny, diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 2c39bd389f878..9ee2c05e28951 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -123,6 +123,43 @@ struct aa_data { struct rhash_head head; }; +/* struct aa_ruleset - data covering mediation rules + * @size: the memory consumed by this ruleset + * @policy: general match rules governing policy + * @file: The set of rules governing basic file access and domain transitions + * @caps: capabilities for the profile + * @rlimits: rlimits for the profile + * @secmark_count: number of secmark entries + * @secmark: secmark label match info + */ +struct aa_ruleset { + int size; + + /* TODO: merge policy and file */ + struct aa_policydb policy; + struct aa_policydb file; + struct aa_caps caps; + + struct aa_rlimit rlimits; + + int secmark_count; + struct aa_secmark *secmark; +}; + +/* struct aa_attachment - data and rules for a profiles attachment + * @xmatch_str: human readable attachment string + * @xmatch: optional extended matching for unconfined executables names + * @xmatch_len: xmatch prefix len, used to determine xmatch priority + * @xattr_count: number of xattrs in table + * @xattrs: table of xattrs + */ +struct aa_attachment { + const char *xmatch_str; + struct aa_policydb xmatch; + unsigned int xmatch_len; + int xattr_count; + char **xattrs; +}; /* struct aa_profile - basic confinement data * @base - base components of the profile (name, refcount, lists, lock ...) @@ -130,18 +167,13 @@ struct aa_data { * @parent: parent of profile * @ns: namespace the profile is in * @rename: optional profile name that this profile renamed - * @attach: human readable attachment string - * @xmatch: optional extended matching for unconfined executables names - * @xmatch_len: xmatch prefix len, used to determine xmatch priority + * * @audit: the auditing mode of the profile * @mode: the enforcement mode of the profile * @path_flags: flags controlling path generation behavior * @disconnected: what to prepend if attach_disconnected is specified - * @size: the memory consumed by this profiles rules - * @policy: general match rules governing policy - * @file: The set of rules governing basic file access and domain transitions - * @caps: capabilities for the profile - * @rlimits: rlimits for the profile + * @attach: attachment rules for the profile + * @rules: rules to be enforced * * @dents: dentries for the profiles file entries in apparmorfs * @dirname: name of the profile dir in apparmorfs @@ -166,27 +198,13 @@ struct aa_profile { struct aa_ns *ns; const char *rename; - const char *attach; - struct aa_policydb xmatch; - unsigned int xmatch_len; - enum audit_mode audit; long mode; u32 path_flags; const char *disconnected; - int size; - struct aa_policydb policy; - struct aa_policydb file; - struct aa_caps caps; - - int xattr_count; - char **xattrs; - - struct aa_rlimit rlimits; - - int secmark_count; - struct aa_secmark *secmark; + struct aa_attachment attach; + struct aa_ruleset rules; struct aa_loaddata *rawdata; unsigned char *hash; @@ -247,24 +265,24 @@ static inline struct aa_profile *aa_get_newest_profile(struct aa_profile *p) return labels_profile(aa_get_newest_label(&p->label)); } -static inline aa_state_t PROFILE_MEDIATES(struct aa_profile *profile, - unsigned char class) +static inline aa_state_t RULE_MEDIATES(struct aa_ruleset *rules, + unsigned char class) { if (class <= AA_CLASS_LAST) - return profile->policy.start[class]; + return rules->policy.start[class]; else - return aa_dfa_match_len(profile->policy.dfa, - profile->policy.start[0], &class, 1); + return aa_dfa_match_len(rules->policy.dfa, + rules->policy.start[0], &class, 1); } -static inline aa_state_t PROFILE_MEDIATES_AF(struct aa_profile *profile, - u16 AF) { - aa_state_t state = PROFILE_MEDIATES(profile, AA_CLASS_NET); +static inline aa_state_t RULE_MEDIATES_AF(struct aa_ruleset *rules, u16 AF) +{ + aa_state_t state = RULE_MEDIATES(rules, AA_CLASS_NET); __be16 be_af = cpu_to_be16(AF); if (!state) return DFA_NOMATCH; - return aa_dfa_match_len(profile->policy.dfa, state, (char *) &be_af, 2); + return aa_dfa_match_len(rules->policy.dfa, state, (char *) &be_af, 2); } /** diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 4ecaf2ba26c54..dc2fa548312db 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -78,19 +78,20 @@ static int profile_signal_perm(struct aa_profile *profile, struct aa_label *peer, u32 request, struct common_audit_data *sa) { + struct aa_ruleset *rules = &profile->rules; struct aa_perms perms; aa_state_t state; if (profile_unconfined(profile) || - !PROFILE_MEDIATES(profile, AA_CLASS_SIGNAL)) + !RULE_MEDIATES(rules, AA_CLASS_SIGNAL)) return 0; aad(sa)->peer = peer; /* TODO: secondary cache check */ - state = aa_dfa_next(profile->policy.dfa, - profile->policy.start[AA_CLASS_SIGNAL], + state = aa_dfa_next(rules->policy.dfa, + rules->policy.start[AA_CLASS_SIGNAL], aad(sa)->signal); - aa_label_match(profile, peer, state, false, request, &perms); + aa_label_match(profile, rules, peer, state, false, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_signal_cb); } diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 3a967003fa7c7..98dadd9609772 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1266,20 +1266,21 @@ static inline bool label_is_visible(struct aa_profile *profile, * visibility test. */ static inline aa_state_t match_component(struct aa_profile *profile, + struct aa_ruleset *rules, struct aa_profile *tp, aa_state_t state) { const char *ns_name; if (profile->ns == tp->ns) - return aa_dfa_match(profile->policy.dfa, state, tp->base.hname); + return aa_dfa_match(rules->policy.dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); - state = aa_dfa_match_len(profile->policy.dfa, state, ":", 1); - state = aa_dfa_match(profile->policy.dfa, state, ns_name); - state = aa_dfa_match_len(profile->policy.dfa, state, ":", 1); - return aa_dfa_match(profile->policy.dfa, state, tp->base.hname); + state = aa_dfa_match_len(rules->policy.dfa, state, ":", 1); + state = aa_dfa_match(rules->policy.dfa, state, ns_name); + state = aa_dfa_match_len(rules->policy.dfa, state, ":", 1); + return aa_dfa_match(rules->policy.dfa, state, tp->base.hname); } /** @@ -1298,6 +1299,7 @@ static inline aa_state_t match_component(struct aa_profile *profile, * check to be stacked. */ static int label_compound_match(struct aa_profile *profile, + struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) @@ -1309,7 +1311,7 @@ static int label_compound_match(struct aa_profile *profile, label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; - state = match_component(profile, tp, state); + state = match_component(profile, rules, tp, state); if (!state) goto fail; goto next; @@ -1323,12 +1325,12 @@ next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; - state = aa_dfa_match(profile->policy.dfa, state, "//&"); - state = match_component(profile, tp, state); + state = aa_dfa_match(rules->policy.dfa, state, "//&"); + state = match_component(profile, rules, tp, state); if (!state) goto fail; } - *perms = *aa_lookup_perms(&profile->policy, state); + *perms = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; @@ -1343,6 +1345,7 @@ fail: /** * label_components_match - find perms for all subcomponents of a label * @profile: profile to find perms for + * @rules: ruleset to search * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns @@ -1356,6 +1359,7 @@ fail: * check to be stacked. */ static int label_components_match(struct aa_profile *profile, + struct aa_ruleset *rules, struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) @@ -1369,7 +1373,7 @@ static int label_components_match(struct aa_profile *profile, label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; - state = match_component(profile, tp, start); + state = match_component(profile, rules, tp, start); if (!state) goto fail; goto next; @@ -1379,16 +1383,16 @@ static int label_components_match(struct aa_profile *profile, return 0; next: - tmp = *aa_lookup_perms(&profile->policy, state); + tmp = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; - state = match_component(profile, tp, start); + state = match_component(profile, rules, tp, start); if (!state) goto fail; - tmp = *aa_lookup_perms(&profile->policy, state); + tmp = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } @@ -1406,6 +1410,7 @@ fail: /** * aa_label_match - do a multi-component label match * @profile: profile to match against (NOT NULL) + * @rules: ruleset to search * @label: label to match (NOT NULL) * @state: state to start in * @subns: whether to match subns components @@ -1414,18 +1419,18 @@ fail: * * Returns: the state the match finished in, may be the none matching state */ -int aa_label_match(struct aa_profile *profile, struct aa_label *label, - aa_state_t state, bool subns, u32 request, - struct aa_perms *perms) +int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, + struct aa_label *label, aa_state_t state, bool subns, + u32 request, struct aa_perms *perms) { - int error = label_compound_match(profile, label, state, subns, request, - perms); + int error = label_compound_match(profile, rules, label, state, subns, + request, perms); if (!error) return error; *perms = allperms; - return label_components_match(profile, label, state, subns, request, - perms); + return label_components_match(profile, rules, label, state, subns, + request, perms); } diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 10e3b11e02adf..ec73e51ca7e3b 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -331,16 +331,18 @@ void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms) perms->prompt = ALL_PERMS_MASK; } -void aa_profile_match_label(struct aa_profile *profile, struct aa_label *label, +void aa_profile_match_label(struct aa_profile *profile, + struct aa_ruleset *rules, + struct aa_label *label, int type, u32 request, struct aa_perms *perms) { /* TODO: doesn't yet handle extended types */ aa_state_t state; - state = aa_dfa_next(profile->policy.dfa, - profile->policy.start[AA_CLASS_LABEL], + state = aa_dfa_next(rules->policy.dfa, + rules->policy.start[AA_CLASS_LABEL], type); - aa_label_match(profile, label, state, false, request, perms); + aa_label_match(profile, rules, label, state, false, request, perms); } @@ -355,7 +357,8 @@ int aa_profile_label_perm(struct aa_profile *profile, struct aa_profile *target, aad(sa)->peer = &target->label; aad(sa)->request = request; - aa_profile_match_label(profile, &target->label, type, request, &perms); + aa_profile_match_label(profile, &profile->rules, &target->label, type, + request, &perms); aa_apply_modes_to_perms(profile, &perms); *deny |= request & perms.deny; return aa_check_perms(profile, &perms, request, sa, aa_audit_perms_cb); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 784709286a626..62f2ca32b9596 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -166,9 +166,9 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective, if (COMPLAIN_MODE(profile)) continue; *effective = cap_intersect(*effective, - profile->caps.allow); + profile->rules.caps.allow); *permitted = cap_intersect(*permitted, - profile->caps.allow); + profile->rules.caps.allow); } } rcu_read_unlock(); diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 02d8215cb9fd6..d4724bdcb07f1 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -303,13 +303,14 @@ static int match_mnt_path_str(struct aa_profile *profile, { struct aa_perms perms = { }; const char *mntpnt = NULL, *info = NULL; + struct aa_ruleset *rules = &profile->rules; int pos, error; AA_BUG(!profile); AA_BUG(!mntpath); AA_BUG(!buffer); - if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + if (!RULE_MEDIATES(rules, AA_CLASS_MOUNT)) return 0; error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer, @@ -324,8 +325,8 @@ static int match_mnt_path_str(struct aa_profile *profile, } error = -EACCES; - pos = do_match_mnt(&profile->policy, - profile->policy.start[AA_CLASS_MOUNT], + pos = do_match_mnt(&rules->policy, + rules->policy.start[AA_CLASS_MOUNT], mntpnt, devname, type, flags, data, binary, &perms); if (pos) { info = mnt_info_table[pos]; @@ -363,7 +364,7 @@ static int match_mnt(struct aa_profile *profile, const struct path *path, AA_BUG(!profile); AA_BUG(devpath && !devbuffer); - if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + if (!RULE_MEDIATES(&profile->rules, AA_CLASS_MOUNT)) return 0; if (devpath) { @@ -565,6 +566,7 @@ out: static int profile_umount(struct aa_profile *profile, const struct path *path, char *buffer) { + struct aa_ruleset *rules = &profile->rules; struct aa_perms perms = { }; const char *name = NULL, *info = NULL; aa_state_t state; @@ -573,7 +575,7 @@ static int profile_umount(struct aa_profile *profile, const struct path *path, AA_BUG(!profile); AA_BUG(!path); - if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + if (!RULE_MEDIATES(rules, AA_CLASS_MOUNT)) return 0; error = aa_path_name(path, path_flags(profile, path), buffer, &name, @@ -581,10 +583,10 @@ static int profile_umount(struct aa_profile *profile, const struct path *path, if (error) goto audit; - state = aa_dfa_match(profile->policy.dfa, - profile->policy.start[AA_CLASS_MOUNT], + state = aa_dfa_match(rules->policy.dfa, + rules->policy.start[AA_CLASS_MOUNT], name); - perms = *aa_lookup_perms(&profile->policy, state); + perms = *aa_lookup_perms(&rules->policy, state); if (AA_MAY_UMOUNT & ~perms.allow) error = -EACCES; @@ -624,6 +626,7 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, const struct path *old_path, char *old_buffer) { + struct aa_ruleset *rules = &profile->rules; const char *old_name, *new_name = NULL, *info = NULL; const char *trans_name = NULL; struct aa_perms perms = { }; @@ -635,7 +638,7 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, AA_BUG(!old_path); if (profile_unconfined(profile) || - !PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + !RULE_MEDIATES(rules, AA_CLASS_MOUNT)) return aa_get_newest_label(&profile->label); error = aa_path_name(old_path, path_flags(profile, old_path), @@ -650,12 +653,12 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, goto audit; error = -EACCES; - state = aa_dfa_match(profile->policy.dfa, - profile->policy.start[AA_CLASS_MOUNT], + state = aa_dfa_match(rules->policy.dfa, + rules->policy.start[AA_CLASS_MOUNT], new_name); - state = aa_dfa_null_transition(profile->policy.dfa, state); - state = aa_dfa_match(profile->policy.dfa, state, old_name); - perms = *aa_lookup_perms(&profile->policy, state); + state = aa_dfa_null_transition(rules->policy.dfa, state); + state = aa_dfa_match(rules->policy.dfa, state, old_name); + perms = *aa_lookup_perms(&rules->policy, state); if (AA_MAY_PIVOTROOT & perms.allow) error = 0; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index d420d3aec3b85..ae789ee834ada 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -108,6 +108,7 @@ void audit_net_cb(struct audit_buffer *ab, void *va) int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, u32 request, u16 family, int type) { + struct aa_ruleset *rules = &profile->rules; struct aa_perms perms = { }; aa_state_t state; __be16 buffer[2]; @@ -117,15 +118,15 @@ int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, if (profile_unconfined(profile)) return 0; - state = PROFILE_MEDIATES(profile, AA_CLASS_NET); + state = RULE_MEDIATES(rules, AA_CLASS_NET); if (!state) return 0; buffer[0] = cpu_to_be16(family); buffer[1] = cpu_to_be16((u16) type); - state = aa_dfa_match_len(profile->policy.dfa, state, (char *) &buffer, + state = aa_dfa_match_len(rules->policy.dfa, state, (char *) &buffer, 4); - perms = *aa_lookup_perms(&profile->policy, state); + perms = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_net_cb); @@ -216,25 +217,26 @@ static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, { int i, ret; struct aa_perms perms = { }; + struct aa_ruleset *rules = &profile->rules; - if (profile->secmark_count == 0) + if (rules->secmark_count == 0) return 0; - for (i = 0; i < profile->secmark_count; i++) { - if (!profile->secmark[i].secid) { - ret = apparmor_secmark_init(&profile->secmark[i]); + for (i = 0; i < rules->secmark_count; i++) { + if (!rules->secmark[i].secid) { + ret = apparmor_secmark_init(&rules->secmark[i]); if (ret) return ret; } - if (profile->secmark[i].secid == secid || - profile->secmark[i].secid == AA_SECID_WILDCARD) { - if (profile->secmark[i].deny) + if (rules->secmark[i].secid == secid || + rules->secmark[i].secid == AA_SECID_WILDCARD) { + if (rules->secmark[i].deny) perms.deny = ALL_PERMS_MASK; else perms.allow = ALL_PERMS_MASK; - if (profile->secmark[i].audit) + if (rules->secmark[i].audit) perms.audit = ALL_PERMS_MASK; } } diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 3c3a5263695de..74c0a3b34e9ba 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -193,6 +193,30 @@ static void aa_free_data(void *ptr, void *arg) kfree_sensitive(data); } +static void free_attachment(struct aa_attachment *attach) +{ + int i; + + for (i = 0; i < attach->xattr_count; i++) + kfree_sensitive(attach->xattrs[i]); + kfree_sensitive(attach->xattrs); + aa_destroy_policydb(&attach->xmatch); +} + +static void free_ruleset(struct aa_ruleset *rules) +{ + int i; + + aa_destroy_policydb(&rules->file); + aa_destroy_policydb(&rules->policy); + aa_free_cap_rules(&rules->caps); + aa_free_rlimit_rules(&rules->rlimits); + + for (i = 0; i < rules->secmark_count; i++) + kfree_sensitive(rules->secmark[i].label); + kfree_sensitive(rules->secmark); +} + /** * aa_free_profile - free a profile * @profile: the profile to free (MAYBE NULL) @@ -206,7 +230,6 @@ static void aa_free_data(void *ptr, void *arg) void aa_free_profile(struct aa_profile *profile) { struct rhashtable *rht; - int i; AA_DEBUG("%s(%p)\n", __func__, profile); @@ -220,19 +243,10 @@ void aa_free_profile(struct aa_profile *profile) aa_put_ns(profile->ns); kfree_sensitive(profile->rename); - aa_destroy_policydb(&profile->file); - aa_free_cap_rules(&profile->caps); - aa_free_rlimit_rules(&profile->rlimits); - - for (i = 0; i < profile->xattr_count; i++) - kfree_sensitive(profile->xattrs[i]); - kfree_sensitive(profile->xattrs); - for (i = 0; i < profile->secmark_count; i++) - kfree_sensitive(profile->secmark[i].label); - kfree_sensitive(profile->secmark); + free_attachment(&profile->attach); + free_ruleset(&profile->rules); kfree_sensitive(profile->dirname); - aa_destroy_policydb(&profile->xmatch); - aa_destroy_policydb(&profile->policy); + if (profile->data) { rht = profile->data; profile->data = NULL; @@ -544,8 +558,8 @@ name: /* released on free_profile */ rcu_assign_pointer(profile->parent, aa_get_profile(parent)); profile->ns = aa_get_ns(parent->ns); - profile->file.dfa = aa_get_dfa(nulldfa); - profile->policy.dfa = aa_get_dfa(nulldfa); + profile->rules.file.dfa = aa_get_dfa(nulldfa); + profile->rules.policy.dfa = aa_get_dfa(nulldfa); mutex_lock_nested(&profile->ns->lock, profile->ns->level); p = __find_child(&parent->base.profiles, bname); diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 43beaad083feb..cb10994cd3b6b 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -91,8 +91,8 @@ static struct aa_profile *alloc_unconfined(const char *name) profile->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; profile->mode = APPARMOR_UNCONFINED; - profile->file.dfa = aa_get_dfa(nulldfa); - profile->policy.dfa = aa_get_dfa(nulldfa); + profile->rules.file.dfa = aa_get_dfa(nulldfa); + profile->rules.policy.dfa = aa_get_dfa(nulldfa); return profile; } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 5a78aaa0eea49..bbca7772dfa2b 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -556,12 +556,12 @@ static bool unpack_xattrs(struct aa_ext *e, struct aa_profile *profile) if (unpack_array(e, NULL, &size) != TRI_TRUE) goto fail; - profile->xattr_count = size; - profile->xattrs = kcalloc(size, sizeof(char *), GFP_KERNEL); - if (!profile->xattrs) + profile->attach.xattr_count = size; + profile->attach.xattrs = kcalloc(size, sizeof(char *), GFP_KERNEL); + if (!profile->attach.xattrs) goto fail; for (i = 0; i < size; i++) { - if (!unpack_strdup(e, &profile->xattrs[i], NULL)) + if (!unpack_strdup(e, &profile->attach.xattrs[i], NULL)) goto fail; } if (!unpack_nameX(e, AA_ARRAYEND, NULL)) @@ -579,6 +579,7 @@ fail: static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) { + struct aa_ruleset *rules = &profile->rules; void *pos = e->pos; u16 size; int i; @@ -587,19 +588,19 @@ static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) if (unpack_array(e, NULL, &size) != TRI_TRUE) goto fail; - profile->secmark = kcalloc(size, sizeof(struct aa_secmark), + rules->secmark = kcalloc(size, sizeof(struct aa_secmark), GFP_KERNEL); - if (!profile->secmark) + if (!rules->secmark) goto fail; - profile->secmark_count = size; + rules->secmark_count = size; for (i = 0; i < size; i++) { - if (!unpack_u8(e, &profile->secmark[i].audit, NULL)) + if (!unpack_u8(e, &rules->secmark[i].audit, NULL)) goto fail; - if (!unpack_u8(e, &profile->secmark[i].deny, NULL)) + if (!unpack_u8(e, &rules->secmark[i].deny, NULL)) goto fail; - if (!unpack_strdup(e, &profile->secmark[i].label, NULL)) + if (!unpack_strdup(e, &rules->secmark[i].label, NULL)) goto fail; } if (!unpack_nameX(e, AA_ARRAYEND, NULL)) @@ -611,12 +612,12 @@ static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) return true; fail: - if (profile->secmark) { + if (rules->secmark) { for (i = 0; i < size; i++) - kfree(profile->secmark[i].label); - kfree(profile->secmark); - profile->secmark_count = 0; - profile->secmark = NULL; + kfree(rules->secmark[i].label); + kfree(rules->secmark); + rules->secmark_count = 0; + rules->secmark = NULL; } e->pos = pos; @@ -634,7 +635,7 @@ static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) u32 tmp = 0; if (!unpack_u32(e, &tmp, NULL)) goto fail; - profile->rlimits.mask = tmp; + profile->rules.rlimits.mask = tmp; if (unpack_array(e, NULL, &size) != TRI_TRUE || size > RLIM_NLIMITS) @@ -644,7 +645,7 @@ static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) int a = aa_map_resource(i); if (!unpack_u64(e, &tmp2, NULL)) goto fail; - profile->rlimits.limits[a].rlim_max = tmp2; + profile->rules.rlimits.limits[a].rlim_max = tmp2; } if (!unpack_nameX(e, AA_ARRAYEND, NULL)) goto fail; @@ -816,6 +817,7 @@ static int datacmp(struct rhashtable_compare_arg *arg, const void *obj) */ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) { + struct aa_ruleset *rules; struct aa_profile *profile = NULL; const char *tmpname, *tmpns = NULL, *name = NULL; const char *info = "failed to unpack profile"; @@ -850,27 +852,30 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile = aa_alloc_profile(name, NULL, GFP_KERNEL); if (!profile) return ERR_PTR(-ENOMEM); + rules = &profile->rules; /* profile renaming is optional */ (void) unpack_str(e, &profile->rename, "rename"); /* attachment string is optional */ - (void) unpack_str(e, &profile->attach, "attach"); + (void) unpack_str(e, &profile->attach.xmatch_str, "attach"); /* xmatch is optional and may be NULL */ - error = unpack_pdb(e, &profile->xmatch, false, false, &info); - if (error) + error = unpack_pdb(e, &profile->attach.xmatch, false, false, &info); + if (error) { + info = "bad xmatch"; goto fail; + } /* neither xmatch_len not xmatch_perms are optional if xmatch is set */ - if (profile->xmatch.dfa) { + if (profile->attach.xmatch.dfa) { if (!unpack_u32(e, &tmp, NULL)) { info = "missing xmatch len"; goto fail; } - profile->xmatch_len = tmp; - profile->xmatch.start[AA_CLASS_XMATCH] = DFA_START; - if (aa_compat_map_xmatch(&profile->xmatch)) { + profile->attach.xmatch_len = tmp; + profile->attach.xmatch.start[AA_CLASS_XMATCH] = DFA_START; + if (aa_compat_map_xmatch(&profile->attach.xmatch)) { info = "failed to convert xmatch permission table"; goto fail; } @@ -926,11 +931,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile->path_flags = PATH_MEDIATE_DELETED; info = "failed to unpack profile capabilities"; - if (!unpack_u32(e, &(profile->caps.allow.cap[0]), NULL)) + if (!unpack_u32(e, &(rules->caps.allow.cap[0]), NULL)) goto fail; - if (!unpack_u32(e, &(profile->caps.audit.cap[0]), NULL)) + if (!unpack_u32(e, &(rules->caps.audit.cap[0]), NULL)) goto fail; - if (!unpack_u32(e, &(profile->caps.quiet.cap[0]), NULL)) + if (!unpack_u32(e, &(rules->caps.quiet.cap[0]), NULL)) goto fail; if (!unpack_u32(e, &tmpcap.cap[0], NULL)) goto fail; @@ -938,11 +943,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "failed to unpack upper profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "caps64")) { /* optional upper half of 64 bit caps */ - if (!unpack_u32(e, &(profile->caps.allow.cap[1]), NULL)) + if (!unpack_u32(e, &(rules->caps.allow.cap[1]), NULL)) goto fail; - if (!unpack_u32(e, &(profile->caps.audit.cap[1]), NULL)) + if (!unpack_u32(e, &(rules->caps.audit.cap[1]), NULL)) goto fail; - if (!unpack_u32(e, &(profile->caps.quiet.cap[1]), NULL)) + if (!unpack_u32(e, &(rules->caps.quiet.cap[1]), NULL)) goto fail; if (!unpack_u32(e, &(tmpcap.cap[1]), NULL)) goto fail; @@ -953,9 +958,9 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) info = "failed to unpack extended profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "capsx")) { /* optional extended caps mediation mask */ - if (!unpack_u32(e, &(profile->caps.extended.cap[0]), NULL)) + if (!unpack_u32(e, &(rules->caps.extended.cap[0]), NULL)) goto fail; - if (!unpack_u32(e, &(profile->caps.extended.cap[1]), NULL)) + if (!unpack_u32(e, &(rules->caps.extended.cap[1]), NULL)) goto fail; if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; @@ -979,40 +984,41 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; - error = unpack_pdb(e, &profile->policy, true, false, &info); + error = unpack_pdb(e, &rules->policy, true, false, + &info); if (error) goto fail; /* Fixup: drop when we get rid of start array */ - if (aa_dfa_next(profile->policy.dfa, profile->policy.start[0], + if (aa_dfa_next(rules->policy.dfa, rules->policy.start[0], AA_CLASS_FILE)) - profile->policy.start[AA_CLASS_FILE] = - aa_dfa_next(profile->policy.dfa, - profile->policy.start[0], + rules->policy.start[AA_CLASS_FILE] = + aa_dfa_next(rules->policy.dfa, + rules->policy.start[0], AA_CLASS_FILE); if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; - if (aa_compat_map_policy(&profile->policy, e->version)) { + if (aa_compat_map_policy(&rules->policy, e->version)) { info = "failed to remap policydb permission table"; goto fail; } } else - profile->policy.dfa = aa_get_dfa(nulldfa); + rules->policy.dfa = aa_get_dfa(nulldfa); /* get file rules */ - error = unpack_pdb(e, &profile->file, false, true, &info); + error = unpack_pdb(e, &rules->file, false, true, &info); if (error) { goto fail; - } else if (profile->file.dfa) { - if (aa_compat_map_file(&profile->file)) { + } else if (rules->file.dfa) { + if (aa_compat_map_file(&rules->file)) { info = "failed to remap file permission table"; goto fail; } - } else if (profile->policy.dfa && - profile->policy.start[AA_CLASS_FILE]) { - profile->file.dfa = aa_get_dfa(profile->policy.dfa); - profile->file.start[AA_CLASS_FILE] = profile->policy.start[AA_CLASS_FILE]; + } else if (rules->policy.dfa && + rules->policy.start[AA_CLASS_FILE]) { + rules->file.dfa = aa_get_dfa(rules->policy.dfa); + rules->file.start[AA_CLASS_FILE] = rules->policy.start[AA_CLASS_FILE]; } else - profile->file.dfa = aa_get_dfa(nulldfa); + rules->file.dfa = aa_get_dfa(nulldfa); if (unpack_nameX(e, AA_STRUCT, "data")) { info = "out of memory"; @@ -1202,28 +1208,28 @@ static bool verify_perms(struct aa_policydb *pdb) */ static int verify_profile(struct aa_profile *profile) { - if ((profile->file.dfa && - !verify_dfa_xindex(profile->file.dfa, - profile->file.trans.size)) || - (profile->policy.dfa && - !verify_dfa_xindex(profile->policy.dfa, - profile->policy.trans.size))) { + if ((profile->rules.file.dfa && + !verify_dfa_xindex(profile->rules.file.dfa, + profile->rules.file.trans.size)) || + (profile->rules.policy.dfa && + !verify_dfa_xindex(profile->rules.policy.dfa, + profile->rules.policy.trans.size))) { audit_iface(profile, NULL, NULL, "Unpack: Invalid named transition", NULL, -EPROTO); return -EPROTO; } - if (!verify_perms(&profile->file)) { + if (!verify_perms(&profile->rules.file)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; } - if (!verify_perms(&profile->policy)) { + if (!verify_perms(&profile->rules.policy)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; } - if (!verify_perms(&profile->xmatch)) { + if (!verify_perms(&profile->attach.xmatch)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index cc018469e22d7..f28026804d13d 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -82,10 +82,11 @@ int aa_map_resource(int resource) static int profile_setrlimit(struct aa_profile *profile, unsigned int resource, struct rlimit *new_rlim) { + struct aa_ruleset *rules = &profile->rules; int e = 0; - if (profile->rlimits.mask & (1 << resource) && new_rlim->rlim_max > - profile->rlimits.limits[resource].rlim_max) + if (rules->rlimits.mask & (1 << resource) && new_rlim->rlim_max > + rules->rlimits.limits[resource].rlim_max) e = -EACCES; return audit_resource(profile, resource, new_rlim->rlim_max, NULL, NULL, e); @@ -153,12 +154,12 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) * to the lesser of the tasks hard limit and the init tasks soft limit */ label_for_each_confined(i, old_l, old) { - if (old->rlimits.mask) { + if (old->rules.rlimits.mask) { int j; for (j = 0, mask = 1; j < RLIM_NLIMITS; j++, mask <<= 1) { - if (old->rlimits.mask & mask) { + if (old->rules.rlimits.mask & mask) { rlim = current->signal->rlim + j; initrlim = init_task.signal->rlim + j; rlim->rlim_cur = min(rlim->rlim_max, @@ -172,15 +173,15 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) label_for_each_confined(i, new_l, new) { int j; - if (!new->rlimits.mask) + if (!new->rules.rlimits.mask) continue; for (j = 0, mask = 1; j < RLIM_NLIMITS; j++, mask <<= 1) { - if (!(new->rlimits.mask & mask)) + if (!(new->rules.rlimits.mask & mask)) continue; rlim = current->signal->rlim + j; rlim->rlim_max = min(rlim->rlim_max, - new->rlimits.limits[j].rlim_max); + new->rules.rlimits.limits[j].rlim_max); /* soft limit should not exceed hard limit */ rlim->rlim_cur = min(rlim->rlim_cur, rlim->rlim_max); } diff --git a/security/apparmor/task.c b/security/apparmor/task.c index b19900f85c148..7e64fba42ca3d 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -223,7 +223,7 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va) FLAGS_NONE, GFP_ATOMIC); } -/* assumes check for PROFILE_MEDIATES is already done */ +/* assumes check for RULE_MEDIATES is already done */ /* TODO: conditionals */ static int profile_ptrace_perm(struct aa_profile *profile, struct aa_label *peer, u32 request, @@ -232,8 +232,8 @@ static int profile_ptrace_perm(struct aa_profile *profile, struct aa_perms perms = { }; aad(sa)->peer = peer; - aa_profile_match_label(profile, peer, AA_CLASS_PTRACE, request, - &perms); + aa_profile_match_label(profile, &profile->rules, peer, + AA_CLASS_PTRACE, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_ptrace_cb); } @@ -243,7 +243,7 @@ static int profile_tracee_perm(struct aa_profile *tracee, struct common_audit_data *sa) { if (profile_unconfined(tracee) || unconfined(tracer) || - !PROFILE_MEDIATES(tracee, AA_CLASS_PTRACE)) + !RULE_MEDIATES(&tracee->rules, AA_CLASS_PTRACE)) return 0; return profile_ptrace_perm(tracee, tracer, request, sa); @@ -256,7 +256,7 @@ static int profile_tracer_perm(struct aa_profile *tracer, if (profile_unconfined(tracer)) return 0; - if (PROFILE_MEDIATES(tracer, AA_CLASS_PTRACE)) + if (RULE_MEDIATES(&tracer->rules, AA_CLASS_PTRACE)) return profile_ptrace_perm(tracer, tracee, request, sa); /* profile uses the old style capability check for ptrace */ -- GitLab From 1ad22fcc4d0d2fb2e0f35aed555a86d016d5e590 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 5 Sep 2022 20:47:36 -0700 Subject: [PATCH 0037/1423] apparmor: rework profile->rules to be a list Convert profile->rules to a list as the next step towards supporting multiple rulesets in a profile. For this step only support a single list entry item. The logic for iterating the list will come as a separate step. Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 3 ++- security/apparmor/capability.c | 6 +++-- security/apparmor/domain.c | 24 ++++++++++++------- security/apparmor/file.c | 6 +++-- security/apparmor/include/policy.h | 17 +++++++++++++- security/apparmor/ipc.c | 5 ++-- security/apparmor/lib.c | 6 +++-- security/apparmor/lsm.c | 7 ++++-- security/apparmor/mount.c | 13 +++++++---- security/apparmor/net.c | 6 +++-- security/apparmor/policy.c | 37 +++++++++++++++++++++++++++--- security/apparmor/policy_ns.c | 6 +++-- security/apparmor/policy_unpack.c | 34 ++++++++++++++------------- security/apparmor/resource.c | 19 ++++++++++----- security/apparmor/task.c | 10 ++++---- 15 files changed, 142 insertions(+), 57 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 84ef8b400b401..f6d83ffde3c42 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -611,7 +611,8 @@ static const struct file_operations aa_fs_ns_revision_fops = { static void profile_query_cb(struct aa_profile *profile, struct aa_perms *perms, const char *match_str, size_t match_len) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms tmp = { }; aa_state_t state = DFA_NOMATCH; diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index b66ec63e2a489..326a51838ef28 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -64,7 +64,8 @@ static void audit_cb(struct audit_buffer *ab, void *va) static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, int cap, int error) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct audit_cache *ent; int type = AUDIT_APPARMOR_AUTO; @@ -115,7 +116,8 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, static int profile_capable(struct aa_profile *profile, int cap, unsigned int opts, struct common_audit_data *sa) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); int error; if (cap_raised(rules->caps.allow, cap) && diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index ad035d14cfc57..d4b09f061aee2 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -81,7 +81,8 @@ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_profile *tp, bool stack, aa_state_t state) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); const char *ns_name; if (stack) @@ -118,7 +119,8 @@ static int label_compound_match(struct aa_profile *profile, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_profile *tp; struct label_it i; struct path_cond cond = { }; @@ -179,7 +181,8 @@ static int label_components_match(struct aa_profile *profile, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_profile *tp; struct label_it i; struct aa_perms tmp; @@ -503,7 +506,8 @@ static const char *next_name(int xtype, const char *name) struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, const char **name) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_label *label = NULL; u32 xtype = xindex & AA_X_TYPE_MASK; int index = xindex & AA_X_INDEX_MASK; @@ -553,7 +557,8 @@ static struct aa_label *x_to_label(struct aa_profile *profile, const char **lookupname, const char **info) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_label *new = NULL; struct aa_ns *ns = profile->ns; u32 xtype = xindex & AA_X_TYPE_MASK; @@ -620,7 +625,8 @@ static struct aa_label *profile_transition(struct aa_profile *profile, char *buffer, struct path_cond *cond, bool *secure_exec) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_label *new = NULL; const char *info = NULL, *name = NULL, *target = NULL; aa_state_t state = rules->file.start[AA_CLASS_FILE]; @@ -719,7 +725,8 @@ static int profile_onexec(struct aa_profile *profile, struct aa_label *onexec, char *buffer, struct path_cond *cond, bool *secure_exec) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); aa_state_t state = rules->file.start[AA_CLASS_FILE]; struct aa_perms perms = {}; const char *xname = NULL, *info = "change_profile onexec"; @@ -1259,7 +1266,8 @@ static int change_profile_perms_wrapper(const char *op, const char *name, struct aa_label *target, bool stack, u32 request, struct aa_perms *perms) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); const char *info = NULL; int error = 0; diff --git a/security/apparmor/file.c b/security/apparmor/file.c index ef5d98f81a2b8..d7f27848e7cc9 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -224,7 +224,8 @@ int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); int e = 0; if (profile_unconfined(profile)) @@ -317,7 +318,8 @@ static int profile_path_link(struct aa_profile *profile, const struct path *target, char *buffer2, struct path_cond *cond) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 9ee2c05e28951..5cadfb20df294 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -124,6 +124,7 @@ struct aa_data { }; /* struct aa_ruleset - data covering mediation rules + * @list: list the rule is on * @size: the memory consumed by this ruleset * @policy: general match rules governing policy * @file: The set of rules governing basic file access and domain transitions @@ -133,6 +134,8 @@ struct aa_data { * @secmark: secmark label match info */ struct aa_ruleset { + struct list_head list; + int size; /* TODO: merge policy and file */ @@ -147,6 +150,7 @@ struct aa_ruleset { }; /* struct aa_attachment - data and rules for a profiles attachment + * @list: * @xmatch_str: human readable attachment string * @xmatch: optional extended matching for unconfined executables names * @xmatch_len: xmatch prefix len, used to determine xmatch priority @@ -204,7 +208,7 @@ struct aa_profile { const char *disconnected; struct aa_attachment attach; - struct aa_ruleset rules; + struct list_head rules; struct aa_loaddata *rawdata; unsigned char *hash; @@ -227,6 +231,7 @@ void aa_add_profile(struct aa_policy *common, struct aa_profile *profile); void aa_free_proxy_kref(struct kref *kref); +struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp); struct aa_profile *aa_alloc_profile(const char *name, struct aa_proxy *proxy, gfp_t gfp); struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, @@ -285,6 +290,16 @@ static inline aa_state_t RULE_MEDIATES_AF(struct aa_ruleset *rules, u16 AF) return aa_dfa_match_len(rules->policy.dfa, state, (char *) &be_af, 2); } +static inline aa_state_t ANY_RULE_MEDIATES(struct list_head *head, + unsigned char class) +{ + struct aa_ruleset *rule; + + /* TODO: change to list walk */ + rule = list_first_entry(head, typeof(*rule), list); + return RULE_MEDIATES(rule, class); +} + /** * aa_get_profile - increment refcount on profile @p * @p: profile (MAYBE NULL) diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index dc2fa548312db..1d4099385bdf8 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -78,12 +78,13 @@ static int profile_signal_perm(struct aa_profile *profile, struct aa_label *peer, u32 request, struct common_audit_data *sa) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms perms; aa_state_t state; if (profile_unconfined(profile) || - !RULE_MEDIATES(rules, AA_CLASS_SIGNAL)) + !ANY_RULE_MEDIATES(&profile->rules, AA_CLASS_SIGNAL)) return 0; aad(sa)->peer = peer; diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index ec73e51ca7e3b..a630c951bb3b8 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -351,14 +351,16 @@ int aa_profile_label_perm(struct aa_profile *profile, struct aa_profile *target, u32 request, int type, u32 *deny, struct common_audit_data *sa) { + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms perms; aad(sa)->label = &profile->label; aad(sa)->peer = &target->label; aad(sa)->request = request; - aa_profile_match_label(profile, &profile->rules, &target->label, type, - request, &perms); + aa_profile_match_label(profile, rules, &target->label, type, request, + &perms); aa_apply_modes_to_perms(profile, &perms); *deny |= request & perms.deny; return aa_check_perms(profile, &perms, request, sa, aa_audit_perms_cb); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 62f2ca32b9596..a22e53e44123b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -163,12 +163,15 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective, struct label_it i; label_for_each_confined(i, label, profile) { + struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; + rules = list_first_entry(&profile->rules, + typeof(*rules), list); *effective = cap_intersect(*effective, - profile->rules.caps.allow); + rules->caps.allow); *permitted = cap_intersect(*permitted, - profile->rules.caps.allow); + rules->caps.allow); } } rcu_read_unlock(); diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index d4724bdcb07f1..cdfa430ae2161 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -303,7 +303,8 @@ static int match_mnt_path_str(struct aa_profile *profile, { struct aa_perms perms = { }; const char *mntpnt = NULL, *info = NULL; - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); int pos, error; AA_BUG(!profile); @@ -359,12 +360,14 @@ static int match_mnt(struct aa_profile *profile, const struct path *path, bool binary) { const char *devname = NULL, *info = NULL; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); int error = -EACCES; AA_BUG(!profile); AA_BUG(devpath && !devbuffer); - if (!RULE_MEDIATES(&profile->rules, AA_CLASS_MOUNT)) + if (!RULE_MEDIATES(rules, AA_CLASS_MOUNT)) return 0; if (devpath) { @@ -566,7 +569,8 @@ out: static int profile_umount(struct aa_profile *profile, const struct path *path, char *buffer) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms perms = { }; const char *name = NULL, *info = NULL; aa_state_t state; @@ -626,7 +630,8 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, const struct path *old_path, char *old_buffer) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); const char *old_name, *new_name = NULL, *info = NULL; const char *trans_name = NULL; struct aa_perms perms = { }; diff --git a/security/apparmor/net.c b/security/apparmor/net.c index ae789ee834ada..788be1609a865 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -108,7 +108,8 @@ void audit_net_cb(struct audit_buffer *ab, void *va) int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, u32 request, u16 family, int type) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms perms = { }; aa_state_t state; __be16 buffer[2]; @@ -217,7 +218,8 @@ static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, { int i, ret; struct aa_perms perms = { }; - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); if (rules->secmark_count == 0) return 0; diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 74c0a3b34e9ba..6f4cc8bfe03de 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -217,6 +217,17 @@ static void free_ruleset(struct aa_ruleset *rules) kfree_sensitive(rules->secmark); } +struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp) +{ + struct aa_ruleset *rules; + + rules = kzalloc(sizeof(*rules), gfp); + if (rules) + INIT_LIST_HEAD(&rules->list); + + return rules; +} + /** * aa_free_profile - free a profile * @profile: the profile to free (MAYBE NULL) @@ -229,6 +240,7 @@ static void free_ruleset(struct aa_ruleset *rules) */ void aa_free_profile(struct aa_profile *profile) { + struct aa_ruleset *rule, *tmp; struct rhashtable *rht; AA_DEBUG("%s(%p)\n", __func__, profile); @@ -244,7 +256,15 @@ void aa_free_profile(struct aa_profile *profile) kfree_sensitive(profile->rename); free_attachment(&profile->attach); - free_ruleset(&profile->rules); + + /* + * at this point there are no tasks that can have a reference + * to rules + */ + list_for_each_entry_safe(rule, tmp, &profile->rules, list) { + list_del_init(&rule->list); + free_ruleset(rule); + } kfree_sensitive(profile->dirname); if (profile->data) { @@ -272,6 +292,7 @@ struct aa_profile *aa_alloc_profile(const char *hname, struct aa_proxy *proxy, gfp_t gfp) { struct aa_profile *profile; + struct aa_ruleset *rules; /* freed by free_profile - usually through aa_put_profile */ profile = kzalloc(struct_size(profile, label.vec, 2), gfp); @@ -283,6 +304,14 @@ struct aa_profile *aa_alloc_profile(const char *hname, struct aa_proxy *proxy, if (!aa_label_init(&profile->label, 1, gfp)) goto fail; + INIT_LIST_HEAD(&profile->rules); + + /* allocate the first ruleset, but leave it empty */ + rules = aa_alloc_ruleset(gfp); + if (!rules) + goto fail; + list_add(&rules->list, &profile->rules); + /* update being set needed by fs interface */ if (!proxy) { proxy = aa_alloc_proxy(&profile->label, gfp); @@ -516,6 +545,7 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, const char *base, gfp_t gfp) { + struct aa_ruleset *rules; struct aa_profile *p, *profile; const char *bname; char *name = NULL; @@ -558,8 +588,9 @@ name: /* released on free_profile */ rcu_assign_pointer(profile->parent, aa_get_profile(parent)); profile->ns = aa_get_ns(parent->ns); - profile->rules.file.dfa = aa_get_dfa(nulldfa); - profile->rules.policy.dfa = aa_get_dfa(nulldfa); + rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules->file.dfa = aa_get_dfa(nulldfa); + rules->policy.dfa = aa_get_dfa(nulldfa); mutex_lock_nested(&profile->ns->lock, profile->ns->level); p = __find_child(&parent->base.profiles, bname); diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index cb10994cd3b6b..121aa79bccaac 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -83,6 +83,7 @@ const char *aa_ns_name(struct aa_ns *curr, struct aa_ns *view, bool subns) static struct aa_profile *alloc_unconfined(const char *name) { struct aa_profile *profile; + struct aa_ruleset *rules; profile = aa_alloc_profile(name, NULL, GFP_KERNEL); if (!profile) @@ -91,8 +92,9 @@ static struct aa_profile *alloc_unconfined(const char *name) profile->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; profile->mode = APPARMOR_UNCONFINED; - profile->rules.file.dfa = aa_get_dfa(nulldfa); - profile->rules.policy.dfa = aa_get_dfa(nulldfa); + rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules->file.dfa = aa_get_dfa(nulldfa); + rules->policy.dfa = aa_get_dfa(nulldfa); return profile; } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index bbca7772dfa2b..ac9955ef5d4a7 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -577,9 +577,8 @@ fail: return false; } -static bool unpack_secmark(struct aa_ext *e, struct aa_profile *profile) +static bool unpack_secmark(struct aa_ext *e, struct aa_ruleset *rules) { - struct aa_ruleset *rules = &profile->rules; void *pos = e->pos; u16 size; int i; @@ -624,7 +623,7 @@ fail: return false; } -static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) +static bool unpack_rlimits(struct aa_ext *e, struct aa_ruleset *rules) { void *pos = e->pos; @@ -635,7 +634,7 @@ static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) u32 tmp = 0; if (!unpack_u32(e, &tmp, NULL)) goto fail; - profile->rules.rlimits.mask = tmp; + rules->rlimits.mask = tmp; if (unpack_array(e, NULL, &size) != TRI_TRUE || size > RLIM_NLIMITS) @@ -645,7 +644,7 @@ static bool unpack_rlimits(struct aa_ext *e, struct aa_profile *profile) int a = aa_map_resource(i); if (!unpack_u64(e, &tmp2, NULL)) goto fail; - profile->rules.rlimits.limits[a].rlim_max = tmp2; + rules->rlimits.limits[a].rlim_max = tmp2; } if (!unpack_nameX(e, AA_ARRAYEND, NULL)) goto fail; @@ -852,7 +851,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile = aa_alloc_profile(name, NULL, GFP_KERNEL); if (!profile) return ERR_PTR(-ENOMEM); - rules = &profile->rules; + rules = list_first_entry(&profile->rules, typeof(*rules), list); /* profile renaming is optional */ (void) unpack_str(e, &profile->rename, "rename"); @@ -971,12 +970,12 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } - if (!unpack_rlimits(e, profile)) { + if (!unpack_rlimits(e, rules)) { info = "failed to unpack profile rlimits"; goto fail; } - if (!unpack_secmark(e, profile)) { + if (!unpack_secmark(e, rules)) { info = "failed to unpack profile secmark rules"; goto fail; } @@ -1208,23 +1207,26 @@ static bool verify_perms(struct aa_policydb *pdb) */ static int verify_profile(struct aa_profile *profile) { - if ((profile->rules.file.dfa && - !verify_dfa_xindex(profile->rules.file.dfa, - profile->rules.file.trans.size)) || - (profile->rules.policy.dfa && - !verify_dfa_xindex(profile->rules.policy.dfa, - profile->rules.policy.trans.size))) { + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); + if (!rules) + return 0; + + if ((rules->file.dfa && !verify_dfa_xindex(rules->file.dfa, + rules->file.trans.size)) || + (rules->policy.dfa && + !verify_dfa_xindex(rules->policy.dfa, rules->policy.trans.size))) { audit_iface(profile, NULL, NULL, "Unpack: Invalid named transition", NULL, -EPROTO); return -EPROTO; } - if (!verify_perms(&profile->rules.file)) { + if (!verify_perms(&rules->file)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; } - if (!verify_perms(&profile->rules.policy)) { + if (!verify_perms(&rules->policy)) { audit_iface(profile, NULL, NULL, "Unpack: Invalid perm index", NULL, -EPROTO); return -EPROTO; diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index f28026804d13d..ed543f4edfd9f 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -82,7 +82,8 @@ int aa_map_resource(int resource) static int profile_setrlimit(struct aa_profile *profile, unsigned int resource, struct rlimit *new_rlim) { - struct aa_ruleset *rules = &profile->rules; + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); int e = 0; if (rules->rlimits.mask & (1 << resource) && new_rlim->rlim_max > @@ -154,12 +155,15 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) * to the lesser of the tasks hard limit and the init tasks soft limit */ label_for_each_confined(i, old_l, old) { - if (old->rules.rlimits.mask) { + struct aa_ruleset *rules = list_first_entry(&old->rules, + typeof(*rules), + list); + if (rules->rlimits.mask) { int j; for (j = 0, mask = 1; j < RLIM_NLIMITS; j++, mask <<= 1) { - if (old->rules.rlimits.mask & mask) { + if (rules->rlimits.mask & mask) { rlim = current->signal->rlim + j; initrlim = init_task.signal->rlim + j; rlim->rlim_cur = min(rlim->rlim_max, @@ -171,17 +175,20 @@ void __aa_transition_rlimits(struct aa_label *old_l, struct aa_label *new_l) /* set any new hard limits as dictated by the new profile */ label_for_each_confined(i, new_l, new) { + struct aa_ruleset *rules = list_first_entry(&new->rules, + typeof(*rules), + list); int j; - if (!new->rules.rlimits.mask) + if (!rules->rlimits.mask) continue; for (j = 0, mask = 1; j < RLIM_NLIMITS; j++, mask <<= 1) { - if (!(new->rules.rlimits.mask & mask)) + if (!(rules->rlimits.mask & mask)) continue; rlim = current->signal->rlim + j; rlim->rlim_max = min(rlim->rlim_max, - new->rules.rlimits.limits[j].rlim_max); + rules->rlimits.limits[j].rlim_max); /* soft limit should not exceed hard limit */ rlim->rlim_cur = min(rlim->rlim_cur, rlim->rlim_max); } diff --git a/security/apparmor/task.c b/security/apparmor/task.c index 7e64fba42ca3d..5000cbd055b62 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -229,11 +229,13 @@ static int profile_ptrace_perm(struct aa_profile *profile, struct aa_label *peer, u32 request, struct common_audit_data *sa) { + struct aa_ruleset *rules = list_first_entry(&profile->rules, + typeof(*rules), list); struct aa_perms perms = { }; aad(sa)->peer = peer; - aa_profile_match_label(profile, &profile->rules, peer, - AA_CLASS_PTRACE, request, &perms); + aa_profile_match_label(profile, rules, peer, AA_CLASS_PTRACE, request, + &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_ptrace_cb); } @@ -243,7 +245,7 @@ static int profile_tracee_perm(struct aa_profile *tracee, struct common_audit_data *sa) { if (profile_unconfined(tracee) || unconfined(tracer) || - !RULE_MEDIATES(&tracee->rules, AA_CLASS_PTRACE)) + !ANY_RULE_MEDIATES(&tracee->rules, AA_CLASS_PTRACE)) return 0; return profile_ptrace_perm(tracee, tracer, request, sa); @@ -256,7 +258,7 @@ static int profile_tracer_perm(struct aa_profile *tracer, if (profile_unconfined(tracer)) return 0; - if (RULE_MEDIATES(&tracer->rules, AA_CLASS_PTRACE)) + if (ANY_RULE_MEDIATES(&tracer->rules, AA_CLASS_PTRACE)) return profile_ptrace_perm(tracer, tracee, request, sa); /* profile uses the old style capability check for ptrace */ -- GitLab From 961f3e3de14467f3babe252f7b6cc44a36ebba64 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 11 Sep 2022 22:05:26 -0700 Subject: [PATCH 0038/1423] apparmor: fix aa_class_names[] to match reserved classes The class name map did not have the reserved names added. Fix this Signed-off-by: John Johansen --- security/apparmor/audit.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index e638f7bc9f528..8dfdda98fbf13 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -48,13 +48,28 @@ static const char *const aa_class_names[] = { "unknown", "ptrace", "signal", - "unknown", + "xmatch", "unknown", "unknown", "net", "unknown", "label", + "posix_mqueue", + "io_uring", + "module", "lsm", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "unknown", + "X", + "dbus", }; -- GitLab From 1f939c6bd1512d0b39b470396740added3cb403f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 20 Sep 2022 04:01:28 -0700 Subject: [PATCH 0039/1423] apparmor: Fix regression in stacking due to label flags The unconfined label flag is not being computed correctly. It should only be set if all the profiles in the vector are set, which is different than what is required for the debug and stale flag that are set if any on the profile flags are set. Fixes: c1ed5da19765 ("apparmor: allow label to carry debug flags") Signed-off-by: John Johansen --- security/apparmor/label.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 98dadd9609772..aa4031628af55 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -197,15 +197,18 @@ static bool vec_is_stale(struct aa_profile **vec, int n) return false; } -static long union_vec_flags(struct aa_profile **vec, int n, long mask) +static long accum_vec_flags(struct aa_profile **vec, int n) { - long u = 0; + long u = FLAG_UNCONFINED; int i; AA_BUG(!vec); for (i = 0; i < n; i++) { - u |= vec[i]->label.flags & mask; + u |= vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | + FLAG_STALE); + if (!(u & vec[i]->label.flags & FLAG_UNCONFINED)) + u &= ~FLAG_UNCONFINED; } return u; @@ -1097,8 +1100,7 @@ static struct aa_label *label_merge_insert(struct aa_label *new, else if (k == b->size) return aa_get_label(b); } - new->flags |= union_vec_flags(new->vec, new->size, FLAG_UNCONFINED | - FLAG_DEBUG1 | FLAG_DEBUG2); + new->flags |= accum_vec_flags(new->vec, new->size); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); -- GitLab From adaa9a3f72e6f98538bfac54f6dc4afc0537f410 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 23 Sep 2022 17:21:18 +0800 Subject: [PATCH 0040/1423] apparmor: Simplify obtain the newest label on a cred In aa_get_task_label(), aa_get_newest_cred_label(__task_cred(task)) can do the same things as aa_get_newest_label(__aa_task_raw_label(task)), so we can replace it and remove __aa_task_raw_label() to simplify the code. Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/include/cred.h | 13 ------------- security/apparmor/task.c | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index 0b9ae4804ef73..58fdc72af6645 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -63,19 +63,6 @@ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) return aa_get_newest_label(aa_cred_raw_label(cred)); } -/** - * __aa_task_raw_label - retrieve another task's label - * @task: task to query (NOT NULL) - * - * Returns: @task's label without incrementing its ref count - * - * If @task != current needs to be called in RCU safe critical section - */ -static inline struct aa_label *__aa_task_raw_label(struct task_struct *task) -{ - return aa_cred_raw_label(__task_cred(task)); -} - /** * aa_current_raw_label - find the current tasks confining label * diff --git a/security/apparmor/task.c b/security/apparmor/task.c index 5000cbd055b62..84d16a29bfcbc 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -31,7 +31,7 @@ struct aa_label *aa_get_task_label(struct task_struct *task) struct aa_label *p; rcu_read_lock(); - p = aa_get_newest_label(__aa_task_raw_label(task)); + p = aa_get_newest_cred_label(__task_cred(task)); rcu_read_unlock(); return p; -- GitLab From 65f7f666f21ce374628d58b3cc48515070f31e72 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 14 Sep 2022 15:46:07 +0800 Subject: [PATCH 0041/1423] apparmor: make __aa_path_perm() static Make __aa_path_perm() static as it's only used inside apparmor/file.c. Signed-off-by: Xiu Jianfeng Signed-off-by: John Johansen --- security/apparmor/file.c | 7 ++++--- security/apparmor/include/file.h | 3 --- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index d7f27848e7cc9..e7dc5ea389973 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -220,9 +220,10 @@ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, return state; } -int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, - u32 request, struct path_cond *cond, int flags, - struct aa_perms *perms) +static int __aa_path_perm(const char *op, struct aa_profile *profile, + const char *name, u32 request, + struct path_cond *cond, int flags, + struct aa_perms *perms) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h index 1a1c0f0c50710..5be620af33ba0 100644 --- a/security/apparmor/include/file.h +++ b/security/apparmor/include/file.h @@ -119,9 +119,6 @@ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); -int __aa_path_perm(const char *op, struct aa_profile *profile, - const char *name, u32 request, struct path_cond *cond, - int flags, struct aa_perms *perms); int aa_path_perm(const char *op, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); -- GitLab From 1ddece8cd0f43582085497eacff2e3cd37f93d1f Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 24 Sep 2022 22:25:25 -0700 Subject: [PATCH 0042/1423] apparmor: Fix doc comment for compute_fperms When compute_fperms was moved to policy_compat and made static it was renamed from aa_compute_fperms to just compute_fperms to help indicate it is only available statically. Unfortunately the doc comment did not also get updated to reflect the change. Reported-by: kernel test robot Signed-off-by: John Johansen --- security/apparmor/policy_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/policy_compat.c b/security/apparmor/policy_compat.c index 1aa5cced935ee..9e52e218bf308 100644 --- a/security/apparmor/policy_compat.c +++ b/security/apparmor/policy_compat.c @@ -140,8 +140,8 @@ static struct aa_perms compute_fperms_other(struct aa_dfa *dfa, } /** - * aa_compute_fperms - convert dfa compressed perms to internal perms and store - * them so they can be retrieved later. + * compute_fperms - convert dfa compressed perms to internal perms and store + * them so they can be retrieved later. * @dfa: a dfa using fperms to remap to internal permissions * * Returns: remapped perm table -- GitLab From 73c7e91c8bc98a5da94be62a9a4ba2793f86a97b Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sat, 24 Sep 2022 22:34:07 -0700 Subject: [PATCH 0043/1423] apparmor: Remove unnecessary size check when unpacking trans_table The index into the trans_table has a max size of 2^24 bits which the code was testing but this is unnecessary as unpack_array can only unpack a table of 2^16 bits in size so the table unpacked will never be larger than what can be indexed, and any test here is redundant. Reported-by: kernel test robot Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index ac9955ef5d4a7..6deaeecb76feb 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -484,9 +484,13 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) u16 size; int i; - if (unpack_array(e, NULL, &size) != TRI_TRUE || - size > (1 << 24)) - /* currently 2^24 bits entries 0-3 */ + if (unpack_array(e, NULL, &size) != TRI_TRUE) + /* + * Note: index into trans table array is a max + * of 2^24, but unpack array can only unpack + * an array of 2^16 in size atm so no need + * for size check here + */ goto fail; table = kcalloc(size, sizeof(char *), GFP_KERNEL); if (!table) -- GitLab From 14d37a7f14569adbf7a019710762271fa2a9e739 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 25 Sep 2022 15:36:45 -0700 Subject: [PATCH 0044/1423] apparmor: make sure the decompression ctx is promperly initialized The decompress ctx was not properly initialized when reading raw profile data back to userspace. Reported-by: kernel test robot Fixes: 52ccc20c652b ("apparmor: use zstd compression for profile data") Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index f6d83ffde3c42..ddd64b8ebf05e 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1327,7 +1327,11 @@ static int decompress_zstd(char *src, size_t slen, char *dst, size_t dlen) ret = -ENOMEM; goto cleanup; } - + ctx = zstd_init_dctx(wksp, wksp_len); + if (ctx == NULL) { + ret = -ENOMEM; + goto cleanup; + } out_len = zstd_decompress_dctx(ctx, dst, dlen, src, slen); if (zstd_is_error(out_len)) { ret = -EINVAL; -- GitLab From 70f24a9f9084b7fffd95daa707cce8e339b189dd Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 29 Sep 2022 06:24:29 -0700 Subject: [PATCH 0045/1423] apparmor: Fix undefined references to zstd_ symbols Unfortunately the switch to using zstd compression did not properly ifdef all the code that uses zstd_ symbols. So that if exporting of binary policy is disabled in the config the compile will fail with the following errors security/apparmor/lsm.c:1545: undefined reference to `zstd_min_clevel' aarch64-linux-ld: security/apparmor/lsm.c:1545: undefined reference to `zstd_max_clevel' Reported-by: kernel test robot Fixes: 52ccc20c652b ("apparmor: use zstd compression for profile data") Signed-off-by: John Johansen Acked-by: Jon Tourville --- security/apparmor/apparmorfs.c | 4 ++-- security/apparmor/include/apparmor.h | 11 +++++++++++ security/apparmor/lsm.c | 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index ddd64b8ebf05e..2c138309ad665 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1202,13 +1202,13 @@ static int seq_ns_name_show(struct seq_file *seq, void *v) static int seq_ns_compress_min_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", zstd_min_clevel()); + seq_printf(seq, "%d\n", AA_MIN_CLEVEL); return 0; } static int seq_ns_compress_max_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", zstd_max_clevel()); + seq_printf(seq, "%d\n", AA_MAX_CLEVEL); return 0; } diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 6d9ca075fcb9c..8a81557c9d59b 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -51,4 +51,15 @@ extern bool aa_g_logsyscall; extern bool aa_g_paranoid_load; extern unsigned int aa_g_path_max; +#ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY +#define AA_MIN_CLEVEL zstd_min_clevel() +#define AA_MAX_CLEVEL zstd_max_clevel() +#define AA_DEFAULT_CLEVEL ZSTD_CLEVEL_DEFAULT +#else +#define AA_MIN_CLEVEL 0 +#define AA_MAX_CLEVEL 0 +#define AA_DEFAULT_CLEVEL 0 +#endif /* CONFIG_SECURITY_APPARMOR_EXPORT_BINARY */ + + #endif /* __APPARMOR_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index a22e53e44123b..8e2b951c4988c 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1365,7 +1365,7 @@ module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ -int aa_g_rawdata_compression_level = ZSTD_CLEVEL_DEFAULT; +int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); @@ -1547,8 +1547,7 @@ static int param_set_aacompressionlevel(const char *val, error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, - zstd_min_clevel(), - zstd_max_clevel()); + AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); -- GitLab From a2f31df06b7aa1769f12ec6f9ae7f18e78582cad Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 29 Sep 2022 06:48:10 -0700 Subject: [PATCH 0046/1423] apparmor: Fix decompression of rawdata for read back to userspace The rawdata readback has a few of problems. First if compression is enabled when the data is read then the compressed data is read out instead decompressing the data. Second if compression of the data fails, the code does not handle holding onto the raw_data in uncompressed form. Third if the compression is enabled/disabled after the rawdata was loaded, the check against the global control of whether to use compression does not reflect what was already done to the data. Fix these by always storing the compressed size, along with the original data size even if compression fails or is not used. And use this to detect whether the rawdata is actually compressed. Fixes: 52ccc20c652b ("apparmor: use zstd compression for profile data") Signed-off-by: John Johansen Acked-by: Jon Tourville --- security/apparmor/apparmorfs.c | 2 +- security/apparmor/policy_unpack.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2c138309ad665..424b2c1e586d5 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1315,7 +1315,7 @@ SEQ_RAWDATA_FOPS(compressed_size); static int decompress_zstd(char *src, size_t slen, char *dst, size_t dlen) { #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY - if (aa_g_rawdata_compression_level == 0) { + if (slen < dlen) { const size_t wksp_len = zstd_dctx_workspace_bound(); zstd_dctx *ctx; void *wksp; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 6deaeecb76feb..45c9dfdc8e0de 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1294,7 +1294,7 @@ static int compress_zstd(const char *src, size_t slen, char **dst, size_t *dlen) } out_len = zstd_compress_cctx(ctx, out, out_len, src, slen, ¶ms); - if (zstd_is_error(out_len)) { + if (zstd_is_error(out_len) || out_len >= slen) { ret = -EINVAL; goto cleanup; } @@ -1348,9 +1348,10 @@ static int compress_loaddata(struct aa_loaddata *data) void *udata = data->data; int error = compress_zstd(udata, data->size, &data->data, &data->compressed_size); - if (error) + if (error) { + data->compressed_size = data->size; return error; - + } if (udata != data->data) kvfree(udata); } else -- GitLab From 32490541682bf8ea445e9bd29c866981851e0912 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 3 Oct 2022 01:30:38 -0700 Subject: [PATCH 0047/1423] apparmor: Fix kunit test for out of bounds array The apparmor kunit tests are failing on the out of bounds array check with the following failure # policy_unpack_test_unpack_array_out_of_bounds: EXPECTATION FAILED at security/apparmor/policy_unpack_test.c:178 Expected unpack_array(puf->e, name, &array_size) == 1, but unpack_array(puf->e, name, &array_size) == -1 # policy_unpack_test_unpack_array_out_of_bounds: EXPECTATION FAILED at security/apparmor/policy_unpack_test.c:180 Expected array_size == 0, but array_size == 64192 not ok 5 - policy_unpack_test_unpack_array_out_of_bounds This is because unpack_array changed to allow distinguishing between the array not being present and an error. In the error case the array size is not set and should not be tested. Reported-by: kernel test robot Fixes: 995a5b64620e ("apparmor: make unpack_array return a trianary value") Signed-off-by: John Johansen --- security/apparmor/policy_unpack_test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 1a43d538c4c04..b214f6ea8a725 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -176,8 +176,7 @@ static void policy_unpack_test_unpack_array_out_of_bounds(struct kunit *test) puf->e->end = puf->e->start + TEST_ARRAY_BUF_OFFSET + sizeof(u16); KUNIT_EXPECT_EQ(test, unpack_array(puf->e, name, &array_size), - TRI_TRUE); - KUNIT_EXPECT_EQ(test, array_size, 0); + TRI_FALSE); KUNIT_EXPECT_PTR_EQ(test, puf->e->pos, puf->e->start + TEST_NAMED_ARRAY_BUF_OFFSET); } -- GitLab From 5515a8e30eaa8ae0d57ec59c908716cf2af114ae Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 4 Oct 2022 13:45:15 +0500 Subject: [PATCH 0048/1423] apparmor: store return value of unpack_perms_table() to signed variable The unpack_perms_table() can return error which is negative value. Store the return value to a signed variable. policy->size is unsigned variable. It shouldn't be used to store the return status. Fixes: 2d6b2dea7f3c ("apparmor: add the ability for policy to specify a permission table") Signed-off-by: Muhammad Usama Anjum Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 45c9dfdc8e0de..09f3169439512 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -734,14 +734,18 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb *policy, { void *pos = e->pos; int i, flags, error = -EPROTO; + ssize_t size; - policy->size = unpack_perms_table(e, &policy->perms); - if (policy->size < 0) { - error = policy->size; + size = unpack_perms_table(e, &policy->perms); + if (size < 0) { + error = size; policy->perms = NULL; *info = "failed to unpack - perms"; goto fail; - } else if (policy->perms) { + } + policy->size = size; + + if (policy->perms) { /* perms table present accept is index */ flags = TO_ACCEPT1_FLAG(YYTD_DATA32); } else { -- GitLab From ee21a175ecfa821b74822881d354c7f848930738 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 10 Oct 2022 11:18:50 -0700 Subject: [PATCH 0049/1423] apparmor: fix uninitialize table variable in error in unpack_trans_table The error path has one case where *table is uninitialized, initialize it. Fixes: a0792e2ceddc ("apparmor: make transition table unpack generic so it can be reused") Reported-by: kernel test robot Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 09f3169439512..3b956b1235f3a 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -477,7 +477,7 @@ static struct aa_dfa *unpack_dfa(struct aa_ext *e, int flags) static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) { void *saved_pos = e->pos; - char **table; + char **table = NULL; /* exec table is optional */ if (unpack_nameX(e, AA_STRUCT, "xtable")) { -- GitLab From 53991aedcd34760be23f1b0ef312e39b6add84af Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 10 Oct 2022 12:15:10 -0700 Subject: [PATCH 0050/1423] apparmor: Fix unpack_profile() warn: passing zero to 'ERR_PTR' unpack_profile() sets a default error on entry but this gets overridden by error assignment by functions called in its body. If an error check that was relying on the default value is triggered after one of these error assignments then zero will be passed to ERR_PTR. Fix this by setting up a default -EPROTO assignment in the error path and while we are at it make sure the correct error is returned in non-default cases. Fixes: 217af7e2f4de ("apparmor: refactor profile rules and attachments") Reported-by: kernel test robot Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 3b956b1235f3a..2e028d540c6b9 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -851,6 +851,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) *ns_name = kstrndup(tmpns, ns_len, GFP_KERNEL); if (!*ns_name) { info = "out of memory"; + error = -ENOMEM; goto fail; } name = tmpname; @@ -882,7 +883,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } profile->attach.xmatch_len = tmp; profile->attach.xmatch.start[AA_CLASS_XMATCH] = DFA_START; - if (aa_compat_map_xmatch(&profile->attach.xmatch)) { + error = aa_compat_map_xmatch(&profile->attach.xmatch); + if (error) { info = "failed to convert xmatch permission table"; goto fail; } @@ -1004,7 +1006,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) AA_CLASS_FILE); if (!unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; - if (aa_compat_map_policy(&rules->policy, e->version)) { + error = aa_compat_map_policy(&rules->policy, e->version); + if (error) { info = "failed to remap policydb permission table"; goto fail; } @@ -1016,7 +1019,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (error) { goto fail; } else if (rules->file.dfa) { - if (aa_compat_map_file(&rules->file)) { + error = aa_compat_map_file(&rules->file); + if (error) { info = "failed to remap file permission table"; goto fail; } @@ -1027,12 +1031,14 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else rules->file.dfa = aa_get_dfa(nulldfa); + error = -EPROTO; if (unpack_nameX(e, AA_STRUCT, "data")) { info = "out of memory"; profile->data = kzalloc(sizeof(*profile->data), GFP_KERNEL); - if (!profile->data) + if (!profile->data) { + error = -ENOMEM; goto fail; - + } params.nelem_hint = 3; params.key_len = sizeof(void *); params.key_offset = offsetof(struct aa_data, key); @@ -1049,6 +1055,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) { kfree_sensitive(key); + error = -ENOMEM; goto fail; } @@ -1058,6 +1065,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (data->size && !data->data) { kfree_sensitive(data->key); kfree_sensitive(data); + error = -ENOMEM; goto fail; } @@ -1079,6 +1087,9 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) return profile; fail: + if (error == 0) + /* default error covers most cases */ + error = -EPROTO; if (profile) name = NULL; else if (!name) -- GitLab From bf08ce132cd069afc45635e1ffeb6adb0523cc60 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Wed, 12 Oct 2022 16:25:23 +0200 Subject: [PATCH 0051/1423] drivers/gpio: use simple i2c probe All these drivers have an i2c probe function which doesn't use the "struct i2c_device_id *id" parameter, so they can trivially be converted to the "probe_new" style of probe with a single argument. This is part of an ongoing transition to single-argument i2c probe functions. Old-style probe functions involve a call to i2c_match_id: in drivers/i2c/i2c-core-base.c, /* * When there are no more users of probe(), * rename probe_new to probe. */ if (driver->probe_new) status = driver->probe_new(client); else if (driver->probe) status = driver->probe(client, i2c_match_id(driver->id_table, client)); else status = -EINVAL; Drivers which don't need the second parameter can be declared using probe_new instead, avoiding the call to i2c_match_id. Drivers which do can still be converted to probe_new-style, calling i2c_match_id themselves (as is done currently for of_match_id). This change was done using the following Coccinelle script, and fixed up for whitespace changes: @ rule1 @ identifier fn; identifier client, id; @@ - static int fn(struct i2c_client *client, const struct i2c_device_id *id) + static int fn(struct i2c_client *client) { ...when != id } @ rule2 depends on rule1 @ identifier rule1.fn; identifier driver; @@ struct i2c_driver driver = { - .probe + .probe_new = ( fn | - &fn + fn ) , }; Signed-off-by: Stephen Kitt Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-gw-pld.c | 5 ++--- drivers/gpio/gpio-max7300.c | 5 ++--- drivers/gpio/gpio-tpic2810.c | 5 ++--- drivers/gpio/gpio-ts4900.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpio-gw-pld.c b/drivers/gpio/gpio-gw-pld.c index 2109803ffb386..5057fa9ad6108 100644 --- a/drivers/gpio/gpio-gw-pld.c +++ b/drivers/gpio/gpio-gw-pld.c @@ -67,8 +67,7 @@ static void gw_pld_set8(struct gpio_chip *gc, unsigned offset, int value) gw_pld_output8(gc, offset, value); } -static int gw_pld_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int gw_pld_probe(struct i2c_client *client) { struct device *dev = &client->dev; struct gw_pld *gw; @@ -126,7 +125,7 @@ static struct i2c_driver gw_pld_driver = { .name = "gw_pld", .of_match_table = gw_pld_dt_ids, }, - .probe = gw_pld_probe, + .probe_new = gw_pld_probe, .id_table = gw_pld_id, }; module_i2c_driver(gw_pld_driver); diff --git a/drivers/gpio/gpio-max7300.c b/drivers/gpio/gpio-max7300.c index 43da381a4d7e7..cf482f4f00982 100644 --- a/drivers/gpio/gpio-max7300.c +++ b/drivers/gpio/gpio-max7300.c @@ -28,8 +28,7 @@ static int max7300_i2c_read(struct device *dev, unsigned int reg) return i2c_smbus_read_byte_data(client, reg); } -static int max7300_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int max7300_probe(struct i2c_client *client) { struct max7301 *ts; @@ -63,7 +62,7 @@ static struct i2c_driver max7300_driver = { .driver = { .name = "max7300", }, - .probe = max7300_probe, + .probe_new = max7300_probe, .remove = max7300_remove, .id_table = max7300_id, }; diff --git a/drivers/gpio/gpio-tpic2810.c b/drivers/gpio/gpio-tpic2810.c index d642c35cb97c0..349c5fbd9b027 100644 --- a/drivers/gpio/gpio-tpic2810.c +++ b/drivers/gpio/gpio-tpic2810.c @@ -98,8 +98,7 @@ static const struct of_device_id tpic2810_of_match_table[] = { }; MODULE_DEVICE_TABLE(of, tpic2810_of_match_table); -static int tpic2810_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int tpic2810_probe(struct i2c_client *client) { struct tpic2810 *gpio; int ret; @@ -144,7 +143,7 @@ static struct i2c_driver tpic2810_driver = { .name = "tpic2810", .of_match_table = tpic2810_of_match_table, }, - .probe = tpic2810_probe, + .probe_new = tpic2810_probe, .remove = tpic2810_remove, .id_table = tpic2810_id_table, }; diff --git a/drivers/gpio/gpio-ts4900.c b/drivers/gpio/gpio-ts4900.c index 416725c26e949..43e8b66e04f78 100644 --- a/drivers/gpio/gpio-ts4900.c +++ b/drivers/gpio/gpio-ts4900.c @@ -136,8 +136,7 @@ static const struct of_device_id ts4900_gpio_of_match_table[] = { }; MODULE_DEVICE_TABLE(of, ts4900_gpio_of_match_table); -static int ts4900_gpio_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ts4900_gpio_probe(struct i2c_client *client) { struct ts4900_gpio_priv *priv; u32 ngpio; @@ -186,7 +185,7 @@ static struct i2c_driver ts4900_gpio_driver = { .name = "ts4900-gpio", .of_match_table = ts4900_gpio_of_match_table, }, - .probe = ts4900_gpio_probe, + .probe_new = ts4900_gpio_probe, .id_table = ts4900_gpio_id_table, }; module_i2c_driver(ts4900_gpio_driver); -- GitLab From 317627a4a19e2a6f8d60e3e2eefe6dfd87059d79 Mon Sep 17 00:00:00 2001 From: Davide Ciminaghi Date: Fri, 2 Sep 2022 14:42:01 +0200 Subject: [PATCH 0052/1423] gpio: Remove sta2x11 GPIO driver The Connext chip has 4 gpio cells looking very similar to those of the Nomadik, whose gpio/pinctrl driver (already featuring devicetree support) will be used instead of the sta2x11 specific one. Signed-off-by: Davide Ciminaghi Acked-by: Giancarlo Asnaghi Acked-by: Linus Walleij Signed-off-by: Christophe Leroy Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 8 - drivers/gpio/Makefile | 1 - drivers/gpio/gpio-sta2x11.c | 411 ------------------------------------ 3 files changed, 420 deletions(-) delete mode 100644 drivers/gpio/gpio-sta2x11.c diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index a01af11806164..e034f752e7cef 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -600,14 +600,6 @@ config GPIO_SPRD help Say yes here to support Spreadtrum GPIO device. -config GPIO_STA2X11 - bool "STA2x11/ConneXt GPIO support" - depends on MFD_STA2X11 - select GENERIC_IRQ_CHIP - help - Say yes here to support the STA2x11/ConneXt GPIO device. - The GPIO module has 128 GPIO pins with alternate functions. - config GPIO_STP_XWAY bool "XWAY STP GPIOs" depends on SOC_XWAY || COMPILE_TEST diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 29e3beb6548cb..84fae267e8ebf 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -140,7 +140,6 @@ obj-$(CONFIG_GPIO_SL28CPLD) += gpio-sl28cpld.o obj-$(CONFIG_GPIO_SODAVILLE) += gpio-sodaville.o obj-$(CONFIG_GPIO_SPEAR_SPICS) += gpio-spear-spics.o obj-$(CONFIG_GPIO_SPRD) += gpio-sprd.o -obj-$(CONFIG_GPIO_STA2X11) += gpio-sta2x11.o obj-$(CONFIG_GPIO_STMPE) += gpio-stmpe.o obj-$(CONFIG_GPIO_STP_XWAY) += gpio-stp-xway.o obj-$(CONFIG_GPIO_SYSCON) += gpio-syscon.o diff --git a/drivers/gpio/gpio-sta2x11.c b/drivers/gpio/gpio-sta2x11.c deleted file mode 100644 index e07cca0f8d353..0000000000000 --- a/drivers/gpio/gpio-sta2x11.c +++ /dev/null @@ -1,411 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * STMicroelectronics ConneXt (STA2X11) GPIO driver - * - * Copyright 2012 ST Microelectronics (Alessandro Rubini) - * Based on gpio-ml-ioh.c, Copyright 2010 OKI Semiconductors Ltd. - * Also based on previous sta2x11 work, Copyright 2011 Wind River Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct gsta_regs { - u32 dat; /* 0x00 */ - u32 dats; - u32 datc; - u32 pdis; - u32 dir; /* 0x10 */ - u32 dirs; - u32 dirc; - u32 unused_1c; - u32 afsela; /* 0x20 */ - u32 unused_24[7]; - u32 rimsc; /* 0x40 */ - u32 fimsc; - u32 is; - u32 ic; -}; - -struct gsta_gpio { - spinlock_t lock; - struct device *dev; - void __iomem *reg_base; - struct gsta_regs __iomem *regs[GSTA_NR_BLOCKS]; - struct gpio_chip gpio; - int irq_base; - /* FIXME: save the whole config here (AF, ...) */ - unsigned irq_type[GSTA_NR_GPIO]; -}; - -/* - * gpio methods - */ - -static void gsta_gpio_set(struct gpio_chip *gpio, unsigned nr, int val) -{ - struct gsta_gpio *chip = gpiochip_get_data(gpio); - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - - if (val) - writel(bit, ®s->dats); - else - writel(bit, ®s->datc); -} - -static int gsta_gpio_get(struct gpio_chip *gpio, unsigned nr) -{ - struct gsta_gpio *chip = gpiochip_get_data(gpio); - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - - return !!(readl(®s->dat) & bit); -} - -static int gsta_gpio_direction_output(struct gpio_chip *gpio, unsigned nr, - int val) -{ - struct gsta_gpio *chip = gpiochip_get_data(gpio); - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - - writel(bit, ®s->dirs); - /* Data register after direction, otherwise pullup/down is selected */ - if (val) - writel(bit, ®s->dats); - else - writel(bit, ®s->datc); - return 0; -} - -static int gsta_gpio_direction_input(struct gpio_chip *gpio, unsigned nr) -{ - struct gsta_gpio *chip = gpiochip_get_data(gpio); - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - - writel(bit, ®s->dirc); - return 0; -} - -static int gsta_gpio_to_irq(struct gpio_chip *gpio, unsigned offset) -{ - struct gsta_gpio *chip = gpiochip_get_data(gpio); - return chip->irq_base + offset; -} - -static void gsta_gpio_setup(struct gsta_gpio *chip) /* called from probe */ -{ - struct gpio_chip *gpio = &chip->gpio; - - /* - * ARCH_NR_GPIOS is currently 256 and dynamic allocation starts - * from the end. However, for compatibility, we need the first - * ConneXt device to start from gpio 0: it's the main chipset - * on most boards so documents and drivers assume gpio0..gpio127 - */ - static int gpio_base; - - gpio->label = dev_name(chip->dev); - gpio->owner = THIS_MODULE; - gpio->direction_input = gsta_gpio_direction_input; - gpio->get = gsta_gpio_get; - gpio->direction_output = gsta_gpio_direction_output; - gpio->set = gsta_gpio_set; - gpio->dbg_show = NULL; - gpio->base = gpio_base; - gpio->ngpio = GSTA_NR_GPIO; - gpio->can_sleep = false; - gpio->to_irq = gsta_gpio_to_irq; - - /* - * After the first device, turn to dynamic gpio numbers. - * For example, with ARCH_NR_GPIOS = 256 we can fit two cards - */ - if (!gpio_base) - gpio_base = -1; -} - -/* - * Special method: alternate functions and pullup/pulldown. This is only - * invoked on startup to configure gpio's according to platform data. - * FIXME : this functionality shall be managed (and exported to other drivers) - * via the pin control subsystem. - */ -static void gsta_set_config(struct gsta_gpio *chip, int nr, unsigned cfg) -{ - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - unsigned long flags; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - u32 val; - int err = 0; - - pr_info("%s: %p %i %i\n", __func__, chip, nr, cfg); - - if (cfg == PINMUX_TYPE_NONE) - return; - - /* Alternate function or not? */ - spin_lock_irqsave(&chip->lock, flags); - val = readl(®s->afsela); - if (cfg == PINMUX_TYPE_FUNCTION) - val |= bit; - else - val &= ~bit; - writel(val | bit, ®s->afsela); - if (cfg == PINMUX_TYPE_FUNCTION) { - spin_unlock_irqrestore(&chip->lock, flags); - return; - } - - /* not alternate function: set details */ - switch (cfg) { - case PINMUX_TYPE_OUTPUT_LOW: - writel(bit, ®s->dirs); - writel(bit, ®s->datc); - break; - case PINMUX_TYPE_OUTPUT_HIGH: - writel(bit, ®s->dirs); - writel(bit, ®s->dats); - break; - case PINMUX_TYPE_INPUT: - writel(bit, ®s->dirc); - val = readl(®s->pdis) | bit; - writel(val, ®s->pdis); - break; - case PINMUX_TYPE_INPUT_PULLUP: - writel(bit, ®s->dirc); - val = readl(®s->pdis) & ~bit; - writel(val, ®s->pdis); - writel(bit, ®s->dats); - break; - case PINMUX_TYPE_INPUT_PULLDOWN: - writel(bit, ®s->dirc); - val = readl(®s->pdis) & ~bit; - writel(val, ®s->pdis); - writel(bit, ®s->datc); - break; - default: - err = 1; - } - spin_unlock_irqrestore(&chip->lock, flags); - if (err) - pr_err("%s: chip %p, pin %i, cfg %i is invalid\n", - __func__, chip, nr, cfg); -} - -/* - * Irq methods - */ - -static void gsta_irq_disable(struct irq_data *data) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); - struct gsta_gpio *chip = gc->private; - int nr = data->irq - chip->irq_base; - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - u32 val; - unsigned long flags; - - spin_lock_irqsave(&chip->lock, flags); - if (chip->irq_type[nr] & IRQ_TYPE_EDGE_RISING) { - val = readl(®s->rimsc) & ~bit; - writel(val, ®s->rimsc); - } - if (chip->irq_type[nr] & IRQ_TYPE_EDGE_FALLING) { - val = readl(®s->fimsc) & ~bit; - writel(val, ®s->fimsc); - } - spin_unlock_irqrestore(&chip->lock, flags); - return; -} - -static void gsta_irq_enable(struct irq_data *data) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); - struct gsta_gpio *chip = gc->private; - int nr = data->irq - chip->irq_base; - struct gsta_regs __iomem *regs = chip->regs[nr / GSTA_GPIO_PER_BLOCK]; - u32 bit = BIT(nr % GSTA_GPIO_PER_BLOCK); - u32 val; - int type; - unsigned long flags; - - type = chip->irq_type[nr]; - - spin_lock_irqsave(&chip->lock, flags); - val = readl(®s->rimsc); - if (type & IRQ_TYPE_EDGE_RISING) - writel(val | bit, ®s->rimsc); - else - writel(val & ~bit, ®s->rimsc); - val = readl(®s->rimsc); - if (type & IRQ_TYPE_EDGE_FALLING) - writel(val | bit, ®s->fimsc); - else - writel(val & ~bit, ®s->fimsc); - spin_unlock_irqrestore(&chip->lock, flags); - return; -} - -static int gsta_irq_type(struct irq_data *d, unsigned int type) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct gsta_gpio *chip = gc->private; - int nr = d->irq - chip->irq_base; - - /* We only support edge interrupts */ - if (!(type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING))) { - pr_debug("%s: unsupported type 0x%x\n", __func__, type); - return -EINVAL; - } - - chip->irq_type[nr] = type; /* used for enable/disable */ - - gsta_irq_enable(d); - return 0; -} - -static irqreturn_t gsta_gpio_handler(int irq, void *dev_id) -{ - struct gsta_gpio *chip = dev_id; - struct gsta_regs __iomem *regs; - u32 is; - int i, nr, base; - irqreturn_t ret = IRQ_NONE; - - for (i = 0; i < GSTA_NR_BLOCKS; i++) { - regs = chip->regs[i]; - base = chip->irq_base + i * GSTA_GPIO_PER_BLOCK; - while ((is = readl(®s->is))) { - nr = __ffs(is); - irq = base + nr; - generic_handle_irq(irq); - writel(1 << nr, ®s->ic); - ret = IRQ_HANDLED; - } - } - return ret; -} - -static int gsta_alloc_irq_chip(struct gsta_gpio *chip) -{ - struct irq_chip_generic *gc; - struct irq_chip_type *ct; - int rv; - - gc = devm_irq_alloc_generic_chip(chip->dev, KBUILD_MODNAME, 1, - chip->irq_base, - chip->reg_base, handle_simple_irq); - if (!gc) - return -ENOMEM; - - gc->private = chip; - ct = gc->chip_types; - - ct->chip.irq_set_type = gsta_irq_type; - ct->chip.irq_disable = gsta_irq_disable; - ct->chip.irq_enable = gsta_irq_enable; - - /* FIXME: this makes at most 32 interrupts. Request 0 by now */ - rv = devm_irq_setup_generic_chip(chip->dev, gc, - 0 /* IRQ_MSK(GSTA_GPIO_PER_BLOCK) */, - 0, IRQ_NOREQUEST | IRQ_NOPROBE, 0); - if (rv) - return rv; - - /* Set up all 128 interrupts: code from setup_generic_chip */ - { - struct irq_chip_type *ct = gc->chip_types; - int i, j; - for (j = 0; j < GSTA_NR_GPIO; j++) { - i = chip->irq_base + j; - irq_set_chip_and_handler(i, &ct->chip, ct->handler); - irq_set_chip_data(i, gc); - irq_clear_status_flags(i, IRQ_NOREQUEST | IRQ_NOPROBE); - } - gc->irq_cnt = i - gc->irq_base; - } - - return 0; -} - -/* The platform device used here is instantiated by the MFD device */ -static int gsta_probe(struct platform_device *dev) -{ - int i, err; - struct pci_dev *pdev; - struct sta2x11_gpio_pdata *gpio_pdata; - struct gsta_gpio *chip; - - pdev = *(struct pci_dev **)dev_get_platdata(&dev->dev); - gpio_pdata = dev_get_platdata(&pdev->dev); - - if (gpio_pdata == NULL) - dev_err(&dev->dev, "no gpio config\n"); - pr_debug("gpio config: %p\n", gpio_pdata); - - chip = devm_kzalloc(&dev->dev, sizeof(*chip), GFP_KERNEL); - if (!chip) - return -ENOMEM; - chip->dev = &dev->dev; - chip->reg_base = devm_platform_ioremap_resource(dev, 0); - if (IS_ERR(chip->reg_base)) - return PTR_ERR(chip->reg_base); - - for (i = 0; i < GSTA_NR_BLOCKS; i++) { - chip->regs[i] = chip->reg_base + i * 4096; - /* disable all irqs */ - writel(0, &chip->regs[i]->rimsc); - writel(0, &chip->regs[i]->fimsc); - writel(~0, &chip->regs[i]->ic); - } - spin_lock_init(&chip->lock); - gsta_gpio_setup(chip); - if (gpio_pdata) - for (i = 0; i < GSTA_NR_GPIO; i++) - gsta_set_config(chip, i, gpio_pdata->pinconfig[i]); - - /* 384 was used in previous code: be compatible for other drivers */ - err = devm_irq_alloc_descs(&dev->dev, -1, 384, - GSTA_NR_GPIO, NUMA_NO_NODE); - if (err < 0) { - dev_warn(&dev->dev, "sta2x11 gpio: Can't get irq base (%i)\n", - -err); - return err; - } - chip->irq_base = err; - - err = gsta_alloc_irq_chip(chip); - if (err) - return err; - - err = devm_request_irq(&dev->dev, pdev->irq, gsta_gpio_handler, - IRQF_SHARED, KBUILD_MODNAME, chip); - if (err < 0) { - dev_err(&dev->dev, "sta2x11 gpio: Can't request irq (%i)\n", - -err); - return err; - } - - return devm_gpiochip_add_data(&dev->dev, &chip->gpio, chip); -} - -static struct platform_driver sta2x11_gpio_platform_driver = { - .driver = { - .name = "sta2x11-gpio", - .suppress_bind_attrs = true, - }, - .probe = gsta_probe, -}; -builtin_platform_driver(sta2x11_gpio_platform_driver); -- GitLab From 95b39792c6646322e0684f1a1aa395ee82b6f3fb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:02 +0200 Subject: [PATCH 0053/1423] gpio: aggregator: Stop using ARCH_NR_GPIOS ARCH_NR_GPIOS is used locally in aggr_parse() as the maximum number of GPIOs to be aggregated together by the driver since commit ec75039d5550 ("gpio: aggregator: Use bitmap_parselist() for parsing GPIO offsets"). Don't rely on the total possible number of GPIOs in the system but define a local arbitrary macro for that, set to 512 which should be large enough as it is also the default value for ARCH_NR_GPIOS. Signed-off-by: Christophe Leroy Reviewed-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 0cb2664085cf8..6d17d262ad917 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -23,6 +23,7 @@ #include #include +#define AGGREGATOR_MAX_GPIOS 512 /* * GPIO Aggregator sysfs interface @@ -64,7 +65,7 @@ static int aggr_parse(struct gpio_aggregator *aggr) unsigned int i, n = 0; int error = 0; - bitmap = bitmap_alloc(ARCH_NR_GPIOS, GFP_KERNEL); + bitmap = bitmap_alloc(AGGREGATOR_MAX_GPIOS, GFP_KERNEL); if (!bitmap) return -ENOMEM; @@ -84,13 +85,13 @@ static int aggr_parse(struct gpio_aggregator *aggr) } /* GPIO chip + offset(s) */ - error = bitmap_parselist(offsets, bitmap, ARCH_NR_GPIOS); + error = bitmap_parselist(offsets, bitmap, AGGREGATOR_MAX_GPIOS); if (error) { pr_err("Cannot parse %s: %d\n", offsets, error); goto free_bitmap; } - for_each_set_bit(i, bitmap, ARCH_NR_GPIOS) { + for_each_set_bit(i, bitmap, AGGREGATOR_MAX_GPIOS) { error = aggr_add_gpio(aggr, name, i, &n); if (error) goto free_bitmap; -- GitLab From 95e827a1b0b7c8334d24da7b4a2d17ec5aa7374c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:03 +0200 Subject: [PATCH 0054/1423] gpio: davinci: Stop using ARCH_NR_GPIOS Since commit 14e85c0e69d5 ("gpio: remove gpio_descs global array") there is no global limitation anymore on the number of GPIOs in the system so don't clamp the number of GPIOs with ARCH_NR_GPIOS. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-davinci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c index 59c4c48d8296b..1018860c83c27 100644 --- a/drivers/gpio/gpio-davinci.c +++ b/drivers/gpio/gpio-davinci.c @@ -217,9 +217,6 @@ static int davinci_gpio_probe(struct platform_device *pdev) return -EINVAL; } - if (WARN_ON(ARCH_NR_GPIOS < ngpio)) - ngpio = ARCH_NR_GPIOS; - /* * If there are unbanked interrupts then the number of * interrupts is equal to number of gpios else all are banked so -- GitLab From 502df79b860563d79143be7a1453c2b3224cd836 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:04 +0200 Subject: [PATCH 0055/1423] gpiolib: Warn on drivers still using static gpiobase allocation In the preparation of getting completely rid of static gpiobase allocation in the future, emit a warning in drivers still doing so. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 4756ea08894f6..5c64d1a412c7a 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -715,6 +715,9 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * a poison instead. */ gc->base = base; + } else { + dev_warn(&gdev->dev, + "Static allocation of GPIO base is deprecated, use dynamic allocation.\n"); } gdev->base = base; -- GitLab From 7b61212f2a07a5afd213c8876e52b5c9946441e2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:05 +0200 Subject: [PATCH 0056/1423] gpiolib: Get rid of ARCH_NR_GPIOS Since commit 14e85c0e69d5 ("gpio: remove gpio_descs global array") there is no limitation on the number of GPIOs that can be allocated in the system since the allocation is fully dynamic. ARCH_NR_GPIOS is today only used in order to provide downwards gpiobase allocation from that value, while static allocation is performed upwards from 0. However that has the disadvantage of limiting the number of GPIOs that can be registered in the system. To overcome this limitation without requiring each and every platform to provide its 'best-guess' maximum number, rework the allocation to allocate upwards, allowing approx 2 millions of GPIOs. In order to still allow static allocation for legacy drivers, define GPIO_DYNAMIC_BASE with the value 512 as the start for dynamic allocation. The 512 value is chosen because it is the end of the current default range so all current static allocations are expected to be below that value. Of course that's just a rough estimate based on the default value, but assuming static allocations come first, even if there are more static allocations it should fit under the 512 value. In the future, it is expected that all static allocations go away and then dynamic allocation will be patched to start at 0. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- arch/arm/include/asm/gpio.h | 1 - drivers/gpio/gpiolib.c | 10 +++---- include/asm-generic/gpio.h | 55 ++++++++++++++----------------------- 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/arch/arm/include/asm/gpio.h b/arch/arm/include/asm/gpio.h index f3bb8a2bf788e..4ebbb58f06ea6 100644 --- a/arch/arm/include/asm/gpio.h +++ b/arch/arm/include/asm/gpio.h @@ -2,7 +2,6 @@ #ifndef _ARCH_ARM_GPIO_H #define _ARCH_ARM_GPIO_H -/* Note: this may rely upon the value of ARCH_NR_GPIOS set in mach/gpio.h */ #include /* The trivial gpiolib dispatchers */ diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 5c64d1a412c7a..e8faedca6b14f 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -183,14 +183,14 @@ EXPORT_SYMBOL_GPL(gpiod_to_chip); static int gpiochip_find_base(int ngpio) { struct gpio_device *gdev; - int base = ARCH_NR_GPIOS - ngpio; + int base = GPIO_DYNAMIC_BASE; - list_for_each_entry_reverse(gdev, &gpio_devices, list) { + list_for_each_entry(gdev, &gpio_devices, list) { /* found a free space? */ - if (gdev->base + gdev->ngpio <= base) + if (gdev->base >= base + ngpio) break; - /* nope, check the space right before the chip */ - base = gdev->base - ngpio; + /* nope, check the space right after the chip */ + base = gdev->base + gdev->ngpio; } if (gpio_is_valid(base)) { diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index aea9aee1f3e97..a7752cf152ce4 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -11,40 +11,18 @@ #include #include -/* Platforms may implement their GPIO interface with library code, +/* + * Platforms may implement their GPIO interface with library code, * at a small performance cost for non-inlined operations and some * extra memory (for code and for per-GPIO table entries). - * - * While the GPIO programming interface defines valid GPIO numbers - * to be in the range 0..MAX_INT, this library restricts them to the - * smaller range 0..ARCH_NR_GPIOS-1. - * - * ARCH_NR_GPIOS is somewhat arbitrary; it usually reflects the sum of - * builtin/SoC GPIOs plus a number of GPIOs on expanders; the latter is - * actually an estimate of a board-specific value. */ -#ifndef ARCH_NR_GPIOS -#if defined(CONFIG_ARCH_NR_GPIO) && CONFIG_ARCH_NR_GPIO > 0 -#define ARCH_NR_GPIOS CONFIG_ARCH_NR_GPIO -#else -#define ARCH_NR_GPIOS 512 -#endif -#endif - /* - * "valid" GPIO numbers are nonnegative and may be passed to - * setup routines like gpio_request(). only some valid numbers - * can successfully be requested and used. - * - * Invalid GPIO numbers are useful for indicating no-such-GPIO in - * platform data and other tables. + * At the end we want all GPIOs to be dynamically allocated from 0. + * However, some legacy drivers still perform fixed allocation. + * Until they are all fixed, leave 0-512 space for them. */ - -static inline bool gpio_is_valid(int number) -{ - return number >= 0 && number < ARCH_NR_GPIOS; -} +#define GPIO_DYNAMIC_BASE 512 struct device; struct gpio; @@ -140,12 +118,6 @@ static inline void gpio_unexport(unsigned gpio) #include -static inline bool gpio_is_valid(int number) -{ - /* only non-negative numbers are valid */ - return number >= 0; -} - /* platforms that don't directly support access to GPIOs through I2C, SPI, * or other blocking infrastructure can use these wrappers. */ @@ -169,4 +141,19 @@ static inline void gpio_set_value_cansleep(unsigned gpio, int value) #endif /* !CONFIG_GPIOLIB */ +/* + * "valid" GPIO numbers are nonnegative and may be passed to + * setup routines like gpio_request(). only some valid numbers + * can successfully be requested and used. + * + * Invalid GPIO numbers are useful for indicating no-such-GPIO in + * platform data and other tables. + */ + +static inline bool gpio_is_valid(int number) +{ + /* only non-negative numbers are valid */ + return number >= 0; +} + #endif /* _ASM_GENERIC_GPIO_H */ -- GitLab From f2b470f036770805ebd20fb5cfe800395c7215af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:06 +0200 Subject: [PATCH 0057/1423] Documentation: gpio: Remove text about ARCH_NR_GPIOS ARCH_NR_GPIOS have been removed, clean up the documentation. After this patch, the only place when ARCH_NR_GPIOS remains is in translations/zh_CN/gpio.txt and translations/zh_TW/gpio.txt. I don't have the skills to update that, anyway those two files are already out of sync as they are still mentionning ARCH_REQUIRE_GPIOLIB which was removed by commit 65053e1a7743 ("gpio: delete ARCH_[WANTS_OPTIONAL|REQUIRE]_GPIOLIB") Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/legacy.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Documentation/driver-api/gpio/legacy.rst b/Documentation/driver-api/gpio/legacy.rst index 9b12eeb89170f..e17910cc32719 100644 --- a/Documentation/driver-api/gpio/legacy.rst +++ b/Documentation/driver-api/gpio/legacy.rst @@ -558,11 +558,6 @@ Platform Support To force-enable this framework, a platform's Kconfig will "select" GPIOLIB, else it is up to the user to configure support for GPIO. -It may also provide a custom value for ARCH_NR_GPIOS, so that it better -reflects the number of GPIOs in actual use on that platform, without -wasting static table space. (It should count both built-in/SoC GPIOs and -also ones on GPIO expanders. - If neither of these options are selected, the platform does not support GPIOs through GPIO-lib and the code cannot be enabled by the user. -- GitLab From f71806d8dc6c053b5a5344536380c5b2bb82b3fc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:07 +0200 Subject: [PATCH 0058/1423] x86: Remove CONFIG_ARCH_NR_GPIO CONFIG_ARCH_NR_GPIO is not used anymore, remove it. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Acked-by: Thomas Gleixner Signed-off-by: Bartosz Golaszewski --- arch/x86/Kconfig | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a2..a3288f7e7b2bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -357,11 +357,6 @@ config ARCH_HAS_CPU_RELAX config ARCH_HIBERNATION_POSSIBLE def_bool y -config ARCH_NR_GPIO - int - default 1024 if X86_64 - default 512 - config ARCH_SUSPEND_POSSIBLE def_bool y -- GitLab From 8937944f4ee4f5763de8784ccdb068d93d9b0f3e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:08 +0200 Subject: [PATCH 0059/1423] arm: Remove CONFIG_ARCH_NR_GPIO CONFIG_ARCH_NR_GPIO is not used anymore, remove it. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- arch/arm/Kconfig | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a08c9d092a332..b03fd451122ee 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1155,27 +1155,6 @@ config ARM_PSCI 0022A ("Power State Coordination Interface System Software on ARM processors"). -# The GPIO number here must be sorted by descending number. In case of -# a multiplatform kernel, we just want the highest value required by the -# selected platforms. -config ARCH_NR_GPIO - int - default 2048 if ARCH_INTEL_SOCFPGA - default 1024 if ARCH_BRCMSTB || ARCH_RENESAS || ARCH_TEGRA || \ - ARCH_ZYNQ || ARCH_ASPEED - default 512 if ARCH_EXYNOS || ARCH_KEYSTONE || SOC_OMAP5 || \ - SOC_DRA7XX || ARCH_S3C24XX || ARCH_S3C64XX || ARCH_S5PV210 - default 416 if ARCH_SUNXI - default 392 if ARCH_U8500 - default 352 if ARCH_VT8500 - default 288 if ARCH_ROCKCHIP - default 264 if MACH_H4700 - default 0 - help - Maximum number of GPIOs in the system. - - If unsure, leave the default value. - config HZ_FIXED int default 128 if SOC_AT91RM9200 -- GitLab From f5a681d238885f238a5f06fcfda625a90d87a327 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 2 Sep 2022 14:42:09 +0200 Subject: [PATCH 0060/1423] arm64: Remove CONFIG_ARCH_NR_GPIO CONFIG_ARCH_NR_GPIO is not used anymore, remove it. Signed-off-by: Christophe Leroy Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- arch/arm64/Kconfig | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 505c8a1ccbe0c..a0c36763d9547 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2145,18 +2145,6 @@ config STACKPROTECTOR_PER_TASK def_bool y depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_SYSREG -# The GPIO number here must be sorted by descending number. In case of -# a multiplatform kernel, we just want the highest value required by the -# selected platforms. -config ARCH_NR_GPIO - int - default 2048 if ARCH_APPLE - default 0 - help - Maximum number of GPIOs in the system. - - If unsure, leave the default value. - endmenu # "Kernel Features" menu "Boot options" -- GitLab From b5636d45aae42aa345b4c7918bdef245ed63da68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:41 +0200 Subject: [PATCH 0061/1423] x86/cpu: Remove segment load from switch_to_new_gdt() On 32bit FS and on 64bit GS segments are already set up correctly, but load_percpu_segment() still sets [FG]S after switching from the early GDT to the direct GDT. For 32bit the segment load has no side effects, but on 64bit it causes GSBASE to become 0, which means that any per CPU access before GSBASE is set to the new value is going to fault. That's the reason why the whole file containing this code has stackprotector removed. But that's a pointless exercise for both 32 and 64 bit as the relevant segment selector is already correct. Loading the new GDT does not change that. Remove the segment loads and add comments. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.097052006@infradead.org --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 47 +++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67c9d73b31faa..e21ec970d41ab 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -670,7 +670,6 @@ extern struct desc_ptr early_gdt_descr; extern void switch_to_new_gdt(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); -extern void load_percpu_segment(int); extern void cpu_init(void); extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e508f2390983..c09abee6f4d5b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -701,16 +701,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); -void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@ -738,16 +728,41 @@ void load_fixmap_gdt(int cpu) } EXPORT_SYMBOL_GPL(load_fixmap_gdt); -/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_to_new_gdt - Switch form early GDT to the direct one + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU + * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per + * CPU area. */ void switch_to_new_gdt(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif } static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -- GitLab From 1f19e2d50baf6515991844eaa8a84a0b0037da70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:42 +0200 Subject: [PATCH 0062/1423] x86/cpu: Get rid of redundant switch_to_new_gdt() invocations The only place where switch_to_new_gdt() is required is early boot to switch from the early GDT to the direct GDT. Any other invocation is completely redundant because it does not change anything. Secondary CPUs come out of the ASM code with GDT and GSBASE correctly set up. The same is true for XEN_PV. Remove all the voodoo invocations which are left overs from the ancient past, rename the function to switch_gdt_and_percpu_base() and mark it init. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.198076128@infradead.org --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 17 ++++++----------- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 6 +++++- arch/x86/xen/enlighten_pv.c | 2 +- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e21ec970d41ab..c660700ecfc61 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -667,7 +667,7 @@ extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void switch_to_new_gdt(int); +extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c09abee6f4d5b..f51928dd275ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -729,14 +729,15 @@ void load_fixmap_gdt(int cpu) EXPORT_SYMBOL_GPL(load_fixmap_gdt); /** - * switch_to_new_gdt - Switch form early GDT to the direct one + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base * @cpu: The CPU number for which this is invoked * - * Invoked during early boot to switch from early GDT and early per CPU - * (%fs on 32bit, GS_BASE on 64bit) to the direct GDT and the runtime per - * CPU area. + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. */ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { load_direct_gdt(cpu); @@ -2263,12 +2264,6 @@ void cpu_init(void) boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 49325caa7307d..555089a5b446d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -211,7 +211,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); } /* indicate the early static arrays will soon be gone */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f3ea0287f694..ce8728d2e5efa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1453,7 +1453,11 @@ void arch_thaw_secondary_cpus_end(void) void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f82857e488152..9b892079581ba 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1209,7 +1209,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; -- GitLab From 2cb15faaedeb67f52f2ddc32b5ca152acfc422c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:43 +0200 Subject: [PATCH 0063/1423] x86/cpu: Re-enable stackprotector Commit 5416c2663517 ("x86: make sure load_percpu_segment has no stackprotector") disabled the stackprotector for cpu/common.c because of load_percpu_segment(). Back then the boot stack canary was initialized very early in start_kernel(). Switching the per CPU area by loading the GDT caused the stackprotector to fail with paravirt enabled kernels as the GSBASE was not updated yet. In hindsight a wrong change because it would have been sufficient to ensure that the canary is the same in both per CPU areas. Commit d55535232c3d ("random: move rand_initialize() earlier") moved the stack canary initialization to a later point in the init sequence. As a consequence the per CPU stack canary is 0 when switching the per CPU areas, so there is no requirement anymore to exclude this file. Add a comment to load_percpu_segment(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.303010511@infradead.org --- arch/x86/kernel/cpu/Makefile | 3 --- arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index f10a921ee7565..d7e3ceaf75c16 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -# Make sure load_percpu_segment has no stackprotector -CFLAGS_common.o := -fno-stack-protector - obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f51928dd275ad..8e873181759a2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -752,6 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. */ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else -- GitLab From 4c4eb3ecc91f4fee6d6bf7cfbc1e21f2e38d19ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:44 +0200 Subject: [PATCH 0064/1423] x86/modules: Set VM_FLUSH_RESET_PERMS in module_alloc() Instead of resetting permissions all over the place when freeing module memory tell the vmalloc code to do so. Avoids the exercise for the next upcoming user. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.406703869@infradead.org --- arch/x86/kernel/ftrace.c | 2 -- arch/x86/kernel/kprobes/core.c | 1 - arch/x86/kernel/module.c | 9 +++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bd165004776d9..00eac455a3a1f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -413,8 +413,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; - set_vm_flush_reset_perms(trampoline); - if (likely(system_state != SYSTEM_BOOTING)) set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index eb8bc82846b99..01b8d956aa76d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -414,7 +414,6 @@ void *alloc_insn_page(void) if (!page) return NULL; - set_vm_flush_reset_perms(page); /* * First make the page read-only, and only then make it executable to * prevent it from being W+X in between. diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index c032edcd3d95e..43f0112772192 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -74,10 +74,11 @@ void *module_alloc(unsigned long size) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; -- GitLab From b26d66f8dace32c46ce58147002964ce8cdfde5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:45 +0200 Subject: [PATCH 0065/1423] x86/vdso: Ensure all kernel code is seen by objtool extable.c is kernel code and not part of the VDSO Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.512144110@infradead.org --- arch/x86/entry/vdso/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3e88b9df8c8f1..3ef611044c8f6 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -33,11 +33,12 @@ vobjs32-y += vdso32/vclock_gettime.o vobjs-$(CONFIG_X86_SGX) += vsgx.o # files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n +obj-y += vma.o extable.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y +OBJECT_FILES_NON_STANDARD_vma.o := n +OBJECT_FILES_NON_STANDARD_extable.o := n # vDSO images to build vdso_img-$(VDSO64-y) += 64 -- GitLab From 24a9c543d2114d416f84e386c2fa90089bd97e4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:46 +0200 Subject: [PATCH 0066/1423] x86: Sanitize linker script The section ordering in the text section is more than suboptimal: ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT STATIC_CALL_TEXT INDIRECT_THUNK_TEXT ENTRY_TEXT is in a seperate PMD so it can be mapped into the cpu entry area when KPTI is enabled. That means the sections after it are also in a seperate PMD. That's wasteful especially as the indirect thunk text is a hotpath on retpoline enabled systems and the static call text is fairly hot on 32bit. Move the entry text section last so that the other sections share a PMD with the text before it. This is obviously just best effort and not guaranteed when the previous text is just at a PMD boundary. The text section placement needs an overhaul in general. There is e.g. no point to have debugfs, sysfs, cpuhotplug and other rarely used functions next to hot path text. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.614728935@infradead.org --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 15f29053cec46..0e9fc080c417d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -132,18 +132,19 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT - ALIGN_ENTRY_TEXT_BEGIN - ENTRY_TEXT - ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT - STATIC_CALL_TEXT - *(.gnu.warning) - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.*) __indirect_thunk_end = .; #endif + STATIC_CALL_TEXT + + ALIGN_ENTRY_TEXT_BEGIN + ENTRY_TEXT + ALIGN_ENTRY_TEXT_END + *(.gnu.warning) + } :text =0xcccc /* End of text section, which should occupy whole number of pages */ -- GitLab From d49a0626216b95cd4bf696f6acf55f39a16ab0bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:47 +0200 Subject: [PATCH 0067/1423] arch: Introduce CONFIG_FUNCTION_ALIGNMENT Generic function-alignment infrastructure. Architectures can select FUNCTION_ALIGNMENT_xxB symbols; the FUNCTION_ALIGNMENT symbol is then set to the largest such selected size, 0 otherwise. From this the -falign-functions compiler argument and __ALIGN macro are set. This incorporates the DEBUG_FORCE_FUNCTION_ALIGN_64B knob and future alignment requirements for x86_64 (later in this series) into a single place. NOTE: also removes the 0x90 filler byte from the generic __ALIGN primitive, that value makes no sense outside of x86. NOTE: .balign 0 reverts to a no-op. Requested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.719248727@infradead.org --- Makefile | 4 ++-- arch/Kconfig | 24 ++++++++++++++++++++++++ arch/ia64/Kconfig | 1 + arch/ia64/Makefile | 2 +- arch/x86/Kconfig | 2 ++ arch/x86/boot/compressed/head_64.S | 8 ++++++++ arch/x86/include/asm/linkage.h | 4 +--- include/asm-generic/vmlinux.lds.h | 4 ++-- include/linux/linkage.h | 4 ++-- lib/Kconfig.debug | 1 + 10 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index f41ec8c8426ba..141e1bcc06715 100644 --- a/Makefile +++ b/Makefile @@ -1004,8 +1004,8 @@ KBUILD_CFLAGS += $(CC_FLAGS_CFI) export CC_FLAGS_CFI endif -ifdef CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B -KBUILD_CFLAGS += -falign-functions=64 +ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) +KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT) endif # arch Makefile may override CC so keep this after arch Makefile is included diff --git a/arch/Kconfig b/arch/Kconfig index 8f138e580d1ae..4025802538029 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1428,4 +1428,28 @@ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" +config FUNCTION_ALIGNMENT_4B + bool + +config FUNCTION_ALIGNMENT_8B + bool + +config FUNCTION_ALIGNMENT_16B + bool + +config FUNCTION_ALIGNMENT_32B + bool + +config FUNCTION_ALIGNMENT_64B + bool + +config FUNCTION_ALIGNMENT + int + default 64 if FUNCTION_ALIGNMENT_64B + default 32 if FUNCTION_ALIGNMENT_32B + default 16 if FUNCTION_ALIGNMENT_16B + default 8 if FUNCTION_ALIGNMENT_8B + default 4 if FUNCTION_ALIGNMENT_4B + default 0 + endmenu diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c6e06cdc738f0..d7e4a24e8644c 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -63,6 +63,7 @@ config IA64 select NUMA if !FLATMEM select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select ZONE_DMA32 + select FUNCTION_ALIGNMENT_32B default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 56c4bb276b6ed..d553ab7022fe4 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -23,7 +23,7 @@ KBUILD_AFLAGS_KERNEL := -mconstant-gp EXTRA := cflags-y := -pipe $(EXTRA) -ffixed-r13 -mfixed-range=f12-f15,f32-f127 \ - -falign-functions=32 -frename-registers -fno-optimize-sibling-calls + -frename-registers -fno-optimize-sibling-calls KBUILD_CFLAGS_KERNEL := -mconstant-gp GAS_STATUS = $(shell $(srctree)/arch/ia64/scripts/check-gas "$(CC)" "$(OBJDUMP)") diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a2..f408fa87ed947 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -290,6 +290,8 @@ config X86 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX + select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16 + select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d33f060900d23..190b803eb7871 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -37,6 +37,14 @@ #include #include "pgtable.h" +/* + * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result + * in assembly errors due to trying to move .org backward due to the excessive + * alignment. + */ +#undef __ALIGN +#define __ALIGN .balign 16, 0x90 + /* * Locally defined symbols should be marked hidden: */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f484d656d34ee..9ee0e28517423 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -14,10 +14,8 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) -#define __ALIGN .p2align 4, 0x90 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#endif #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c15de165ec8ff..335b5711a7ede 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -81,8 +81,8 @@ #define RO_EXCEPTION_TABLE #endif -/* Align . to a 8 byte boundary equals to maximum function alignment. */ -#define ALIGN_FUNCTION() . = ALIGN(8) +/* Align . function alignment. */ +#define ALIGN_FUNCTION() . = ALIGN(CONFIG_FUNCTION_ALIGNMENT) /* * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 1feab6136b5b5..5c8865bb59d91 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -69,8 +69,8 @@ #endif #ifndef __ALIGN -#define __ALIGN .align 4,0x90 -#define __ALIGN_STR ".align 4,0x90" +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT +#define __ALIGN_STR __stringify(__ALIGN) #endif #ifdef __ASSEMBLY__ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa2..e90dc67385347 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -467,6 +467,7 @@ config SECTION_MISMATCH_WARN_ONLY config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC) + select FUNCTION_ALIGNMENT_64B help There are cases that a commit from one domain changes the function address alignment of other domains, and cause magic performance -- GitLab From 8eb5d34e77c63fde8af21c691bcf6e3cd87f7829 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:48 +0200 Subject: [PATCH 0068/1423] x86/asm: Differentiate between code and function alignment Create SYM_F_ALIGN to differentiate alignment requirements between SYM_CODE and SYM_FUNC. This distinction is useful later when adding padding in front of functions; IOW this allows following the compiler's patchable-function-entry option. [peterz: Changelog] Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.824822743@infradead.org --- arch/x86/include/asm/linkage.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9ee0e28517423..c2d6e2733b111 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -12,11 +12,15 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ -#ifdef __ASSEMBLY__ - #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) +#define ASM_FUNC_ALIGN __ALIGN_STR +#define __FUNC_ALIGN __ALIGN +#define SYM_F_ALIGN __FUNC_ALIGN + +#ifdef __ASSEMBLY__ + #if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_RETPOLINE */ @@ -55,7 +59,7 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ @@ -65,7 +69,7 @@ /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ @@ -75,7 +79,7 @@ /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ -- GitLab From 1934dc9a8a92fda36ab80cfd55edab1708dcdf9a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:10:49 +0200 Subject: [PATCH 0069/1423] x86/error_inject: Align function properly Ensure inline asm functions are consistently aligned with compiler generated and SYM_FUNC_START*() functions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.930201368@infradead.org --- arch/x86/lib/error-inject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 1e3de0769b810..b5a6d83106bc2 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -11,6 +11,7 @@ asm( ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" + ASM_FUNC_ALIGN "just_return_func:\n" ANNOTATE_NOENDBR ASM_RET -- GitLab From 1d293758e548aa6ff65e4dd3f5a9bc2a34b38ce3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:50 +0200 Subject: [PATCH 0070/1423] x86/paravirt: Properly align PV functions Ensure inline asm functions are consistently aligned with compiler generated and SYM_FUNC_START*() functions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220915111144.038540008@infradead.org --- arch/x86/include/asm/paravirt.h | 1 + arch/x86/include/asm/qspinlock_paravirt.h | 2 +- arch/x86/kernel/kvm.c | 1 + arch/x86/kernel/paravirt.c | 2 ++ 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2a0b8dd4ec335..1be66c15ecbd7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -665,6 +665,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ + ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 60ece592b2207..082551b3c75ed 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -40,7 +40,7 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); asm (".pushsection .spinlock.text;" ".globl " PV_UNLOCK ";" ".type " PV_UNLOCK ", @function;" - ".align 4,0x90;" + ASM_FUNC_ALIGN PV_UNLOCK ": " ASM_ENDBR FRAME_BEGIN diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d4e48b4a438b2..95fb85bea1118 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -802,6 +802,7 @@ asm( ".pushsection .text;" ".global __raw_callee_save___kvm_vcpu_is_preempted;" ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +ASM_FUNC_ALIGN "__raw_callee_save___kvm_vcpu_is_preempted:" ASM_ENDBR "movq __per_cpu_offset(,%rdi,8), %rax;" diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7ca2d46c08cc9..e244c49b52d7e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -40,6 +40,7 @@ extern void _paravirt_nop(void); asm (".pushsection .entry.text, \"ax\"\n" ".global _paravirt_nop\n" + ASM_FUNC_ALIGN "_paravirt_nop:\n\t" ASM_ENDBR ASM_RET @@ -50,6 +51,7 @@ asm (".pushsection .entry.text, \"ax\"\n" /* stub always returning 0. */ asm (".pushsection .entry.text, \"ax\"\n" ".global paravirt_ret0\n" + ASM_FUNC_ALIGN "paravirt_ret0:\n\t" ASM_ENDBR "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" -- GitLab From 67e93ddd5d0b84ac17bddb13d98533e425282421 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:51 +0200 Subject: [PATCH 0071/1423] x86/entry: Align SYM_CODE_START() variants Explicitly align a bunch of commonly called SYM_CODE_START() symbols. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.144068841@infradead.org --- arch/x86/entry/entry_64.S | 16 ++++++++++------ arch/x86/entry/thunk_64.S | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9953d966d1244..e635f962afb80 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -284,7 +284,8 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" -SYM_CODE_START(ret_from_fork) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread movq %rax, %rdi @@ -600,13 +601,13 @@ SYM_CODE_END(\asmsym) * shared between 32 and 64 bit and emit the __irqentry_text_* markers * so the stacktrace boundary checks work. */ - .align 16 + __ALIGN .globl __irqentry_text_start __irqentry_text_start: #include - .align 16 + __ALIGN .globl __irqentry_text_end __irqentry_text_end: ANNOTATE_NOENDBR @@ -828,7 +829,8 @@ EXPORT_SYMBOL(asm_load_gs_index) * * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) + __FUNC_ALIGN +SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -856,7 +858,8 @@ SYM_CODE_END(exc_xen_hypervisor_callback) * We distinguish between categories by comparing each saved segment register * with its current contents: any discrepancy means we in category 1. */ -SYM_CODE_START(xen_failsafe_callback) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(xen_failsafe_callback) UNWIND_HINT_EMPTY ENDBR movl %ds, %ecx @@ -1516,7 +1519,8 @@ SYM_CODE_END(ignore_sysret) #endif .pushsection .text, "ax" -SYM_CODE_START(rewind_stack_and_make_dead) + __FUNC_ALIGN +SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index f38b07d2768bb..5e37f41e5f14d 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -11,7 +11,7 @@ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func -SYM_FUNC_START_NOALIGN(\name) +SYM_FUNC_START(\name) pushq %rbp movq %rsp, %rbp @@ -36,7 +36,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_thunk) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) +SYM_CODE_START_LOCAL(__thunk_restore) popq %r11 popq %r10 popq %r9 -- GitLab From f6dabc817e1f0da8f4088734ad0b9814adad0bce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:52 +0200 Subject: [PATCH 0072/1423] crypto: x86/camellia: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.248229966@infradead.org --- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 2 -- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 4 ---- 2 files changed, 6 deletions(-) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 2e1658ddbe1a9..4a30618281ec2 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -712,7 +712,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk16) /* input: * %rdi: ctx, CTX @@ -799,7 +798,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk16) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e4e9abbf4de3..deaf62aa73a6b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -221,7 +221,6 @@ * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. */ -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, @@ -229,7 +228,6 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c RET; SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) -.align 8 SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, @@ -748,7 +746,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text -.align 8 SYM_FUNC_START_LOCAL(__camellia_enc_blk32) /* input: * %rdi: ctx, CTX @@ -835,7 +832,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32) jmp .Lenc_done; SYM_FUNC_END(__camellia_enc_blk32) -.align 8 SYM_FUNC_START_LOCAL(__camellia_dec_blk32) /* input: * %rdi: ctx, CTX -- GitLab From 88cdf02551f9aef9284d778ecd375c400555d900 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:53 +0200 Subject: [PATCH 0073/1423] crypto: x86/cast5: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.353555711@infradead.org --- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b258af420c92c..0326a01503c3a 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -208,7 +208,6 @@ .text -.align 16 SYM_FUNC_START_LOCAL(__cast5_enc_blk16) /* input: * %rdi: ctx @@ -282,7 +281,6 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) RET; SYM_FUNC_END(__cast5_enc_blk16) -.align 16 SYM_FUNC_START_LOCAL(__cast5_dec_blk16) /* input: * %rdi: ctx -- GitLab From ba1b270c20dfb7f7b7a076b1a97ef4b7dcb539b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:54 +0200 Subject: [PATCH 0074/1423] crypto: x86/crct10dif-pcl: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.456602381@infradead.org --- arch/x86/crypto/crct10dif-pcl-asm_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 721474abfb719..5286db5b8165e 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -94,7 +94,6 @@ # # Assumes len >= 16. # -.align 16 SYM_FUNC_START(crc_t10dif_pcl) movdqa .Lbswap_mask(%rip), BSWAP_MASK -- GitLab From 8b44221671ec45d725a4558ff7aa5ea90ecfc885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:55 +0200 Subject: [PATCH 0075/1423] crypto: x86/serpent: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.558544791@infradead.org --- arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 2 -- arch/x86/crypto/serpent-avx2-asm_64.S | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 82f2313f512b8..97e2836218518 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) RET; SYM_FUNC_END(__serpent_enc_blk8_avx) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 8ea34c9b93160..6d60c50593a9b 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -550,7 +550,6 @@ #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -.align 8 SYM_FUNC_START_LOCAL(__serpent_enc_blk16) /* input: * %rdi: ctx, CTX @@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16) RET; SYM_FUNC_END(__serpent_enc_blk16) -.align 8 SYM_FUNC_START_LOCAL(__serpent_dec_blk16) /* input: * %rdi: ctx, CTX -- GitLab From c2a3ce6fdb122b12bc4cfffd28ecf8a9fb0d6736 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:56 +0200 Subject: [PATCH 0076/1423] crypto: x86/sha1: Remove custom alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.662580589@infradead.org --- arch/x86/crypto/sha1_ni_asm.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 2f94ec0e763bf..cd943b2af2c41 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -92,7 +92,6 @@ * numBlocks: Number of blocks to process */ .text -.align 32 SYM_FUNC_START(sha1_ni_transform) push %rbp mov %rsp, %rbp -- GitLab From 3ba56d0b87113785413dfc5b9910d45001cc4eeb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:57 +0200 Subject: [PATCH 0077/1423] crypto: x86/sha256: Remove custom alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.766564176@infradead.org --- arch/x86/crypto/sha256-avx-asm.S | 1 - arch/x86/crypto/sha256-avx2-asm.S | 1 - arch/x86/crypto/sha256-ssse3-asm.S | 1 - arch/x86/crypto/sha256_ni_asm.S | 1 - 4 files changed, 4 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 3baa1ec390974..3649370690c5d 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -347,7 +347,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_avx) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9bcdbc47b8b4b..c4c1dc5ee078d 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -524,7 +524,6 @@ STACK_SIZE = _CTX + _CTX_SIZE ######################################################################## .text SYM_FUNC_START(sha256_transform_rorx) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c4a5db612c327..96b7dcdeaebe4 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -356,7 +356,6 @@ a = TMP_ ######################################################################## .text SYM_FUNC_START(sha256_transform_ssse3) -.align 32 pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 94d50dd27cb53..b3f1a1a12027e 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -96,7 +96,6 @@ */ .text -.align 32 SYM_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ -- GitLab From 2f93238b87ddbbe1b050ec48ab5843fc61346adb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:58 +0200 Subject: [PATCH 0078/1423] crypto: x86/sm[34]: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. ( this code couldn't seem to make up it's mind about what alignment it actually wanted, randomly mixing 8 and 16 bytes ) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.868540856@infradead.org --- arch/x86/crypto/sm3-avx-asm_64.S | 1 - arch/x86/crypto/sm4-aesni-avx-asm_64.S | 7 ------- arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 6 ------ 3 files changed, 14 deletions(-) diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S index b12b9efb5ec51..b28d804ee10de 100644 --- a/arch/x86/crypto/sm3-avx-asm_64.S +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -327,7 +327,6 @@ * void sm3_transform_avx(struct sm3_state *state, * const u8 *data, int nblocks); */ -.align 16 SYM_FUNC_START(sm3_transform_avx) /* input: * %rdi: ctx, CTX diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 4767ab61ff489..e13c8537b2ec3 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -139,13 +139,11 @@ .text -.align 16 /* * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt4) /* input: * %rdi: round key array, CTX @@ -249,7 +247,6 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4) RET; SYM_FUNC_END(sm4_aesni_avx_crypt4) -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) /* input: * %rdi: round key array, CTX @@ -363,7 +360,6 @@ SYM_FUNC_END(__sm4_crypt_blk8) * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, * const u8 *src, int nblocks) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_crypt8) /* input: * %rdi: round key array, CTX @@ -419,7 +415,6 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8) * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) /* input: * %rdi: round key array, CTX @@ -494,7 +489,6 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) /* input: * %rdi: round key array, CTX @@ -544,7 +538,6 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) /* input: * %rdi: round key array, CTX diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index 4732fe8bb65b6..2212705f7da6c 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -153,9 +153,6 @@ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef .text -.align 16 - -.align 8 SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) /* input: * %rdi: round key array, CTX @@ -281,7 +278,6 @@ SYM_FUNC_END(__sm4_crypt_blk16) * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) /* input: * %rdi: round key array, CTX @@ -394,7 +390,6 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) /* input: * %rdi: round key array, CTX @@ -448,7 +443,6 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, * const u8 *src, u8 *iv) */ -.align 8 SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) /* input: * %rdi: round key array, CTX -- GitLab From e2c9475e88f70820270ca5cc81a1ae2fda262278 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:59 +0200 Subject: [PATCH 0079/1423] crypto: twofish: Remove redundant alignments SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Also, with having pushed the function alignment to 16 bytes, this custom alignment is completely superfluous. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111144.971229477@infradead.org --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 31f9b2ec3857d..12fde271cd3f6 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -228,7 +228,6 @@ vpxor x2, wkey, x2; \ vpxor x3, wkey, x3; -.align 8 SYM_FUNC_START_LOCAL(__twofish_enc_blk8) /* input: * %rdi: ctx, CTX @@ -270,7 +269,6 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8) RET; SYM_FUNC_END(__twofish_enc_blk8) -.align 8 SYM_FUNC_START_LOCAL(__twofish_dec_blk8) /* input: * %rdi: ctx, CTX -- GitLab From fdc9ee7e97aa2c1dfa7ebb092fffec40ffa59108 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:00 +0200 Subject: [PATCH 0080/1423] crypto: x86/poly1305: Remove custom function alignment SYM_FUNC_START*() and friends already imply alignment, remove custom alignment hacks to make code consistent. This prepares for future function call ABI changes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.073285765@infradead.org --- arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 2077ce7a56479..b9abcd79c1f43 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -108,7 +108,6 @@ if (!$kernel) { sub declare_function() { my ($name, $align, $nargs) = @_; if($kernel) { - $code .= ".align $align\n"; $code .= "SYM_FUNC_START($name)\n"; $code .= ".L$name:\n"; } else { -- GitLab From e57ef2ed97c1d078973298658a8096644a1e9e09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:01 +0200 Subject: [PATCH 0081/1423] x86: Put hot per CPU variables into a struct The layout of per-cpu variables is at the mercy of the compiler. This can lead to random performance fluctuations from build to build. Create a structure to hold some of the hottest per-cpu variables, starting with current_task. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.179707194@infradead.org --- arch/x86/include/asm/current.h | 19 ++++++++++++++++--- arch/x86/kernel/cpu/common.c | 14 +++++--------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b51..63c42ac3cd86d 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -3,16 +3,29 @@ #define _ASM_X86_CURRENT_H #include -#include #ifndef __ASSEMBLY__ + +#include +#include + struct task_struct; -DECLARE_PER_CPU(struct task_struct *, current_task); +struct pcpu_hot { + union { + struct { + struct task_struct *current_task; + }; + u8 pad[64]; + }; +}; +static_assert(sizeof(struct pcpu_hot) == 64); + +DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); static __always_inline struct task_struct *get_current(void) { - return this_cpu_read_stable(current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e873181759a2..52071539a14ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2012,18 +2012,16 @@ static __init int setup_clearcpuid(char *arg) } __setup("clearcpuid=", setup_clearcpuid); +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); @@ -2083,8 +2081,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2f314b170c9f0..807da45d84c72 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -207,7 +207,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); switch_fpu_finish(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6b3418bff3261..c4f6cacf6599d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -617,7 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - this_cpu_write(current_task, next_p); + raw_cpu_write(pcpu_hot.current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce8728d2e5efa..05f315777691c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1046,7 +1046,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ -- GitLab From 64701838bf0575ef8acb1ad2db5934e864f3e6c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:02 +0200 Subject: [PATCH 0082/1423] x86/percpu: Move preempt_count next to current_task Add preempt_count to pcpu_hot, since it is once of the most used per-cpu variables. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.284170644@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/preempt.h | 27 ++++++++++++++------------- arch/x86/kernel/cpu/common.c | 8 +------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 63c42ac3cd86d..0f4b46293c6c0 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -15,6 +15,7 @@ struct pcpu_hot { union { struct { struct task_struct *current_task; + int preempt_count; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee248..2d13f25b1bd8f 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,11 +4,11 @@ #include #include +#include + #include #include -DECLARE_PER_CPU(int, __preempt_count); - /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) @@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc) int old, new; do { - old = raw_cpu_read_4(__preempt_count); + old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); } /* @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(pcpu_hot.preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(pcpu_hot.preempt_count, -val); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + __percpu_arg([var])); } /* @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52071539a14ca..cafb6bd90d107 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2014,6 +2014,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2022,13 +2023,9 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); - DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; static void wrmsrl_cstar(unsigned long val) @@ -2081,9 +2078,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find * the top of the kernel stack. Use an extra percpu variable to track the -- GitLab From 7443b296e699e6922f5be243c8d2e316de8cacbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:03 +0200 Subject: [PATCH 0083/1423] x86/percpu: Move cpu_number next to current_task Also add cpu_number to the pcpu_hot structure, it is often referenced and this cacheline is there. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.387678283@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/smp.h | 12 +++++------- arch/x86/kernel/setup_percpu.c | 5 +---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 0f4b46293c6c0..8ac6589e9a1b7 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -16,6 +16,7 @@ struct pcpu_hot { struct { struct task_struct *current_task; int preempt_count; + int cpu_number; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index a73bced40e241..b4dbb20dab1a1 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -3,10 +3,10 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include -#include -#include #include +#include +#include extern int smp_num_siblings; extern unsigned int num_processors; @@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); @@ -150,11 +149,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); /* * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. + * from the initial startup. */ -#define raw_smp_processor_id() this_cpu_read(cpu_number) -#define __smp_processor_id() __this_cpu_read(cpu_number) +#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) +#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 555089a5b446d..c2fc4c41c1644 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,9 +23,6 @@ #include #include -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - #ifdef CONFIG_X86_64 #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) #else @@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the -- GitLab From c063a217bc0726c2560138229de5673dbb253a02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:04 +0200 Subject: [PATCH 0084/1423] x86/percpu: Move current_top_of_stack next to current_task Extend the struct pcpu_hot cacheline with current_top_of_stack; another very frequently used value. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.493038635@infradead.org --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/entry/entry_64_compat.S | 6 +++--- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/cpu/common.c | 12 +----------- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 11 files changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e309e71560389..91397f58ac300 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e635f962afb80..9249a45cf53fd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -1209,7 +1209,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1525,7 +1525,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4dd19819053a5..1dfee868d4a10 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,7 +58,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -191,7 +191,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -332,7 +332,7 @@ SYM_CODE_START(entry_INT80_compat) ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV movq %rsp, %rax - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp pushq 5*8(%rax) /* regs->ss */ pushq 4*8(%rax) /* regs->rsp */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 8ac6589e9a1b7..2dd013128f1ef 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; + unsigned long top_of_stack; }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c660700ecfc61..c345f3096c80b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -426,8 +426,6 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* @@ -566,7 +564,7 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ - return this_cpu_read_stable(cpu_current_top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } static __always_inline bool on_thread_stack(void) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cb50589a7102f..a9824318e1c55 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,6 +109,8 @@ static void __used common(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cafb6bd90d107..408245c2eead8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2015,6 +2015,7 @@ __setup("clearcpuid=", setup_clearcpuid); DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); @@ -2026,8 +2027,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); DEFINE_PER_CPU(void *, hardirq_stack_ptr); DEFINE_PER_CPU(bool, hardirq_stack_inuse); -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@ -2078,15 +2077,6 @@ void syscall_init(void) #else /* CONFIG_X86_64 */ -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. - */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 807da45d84c72..470c128759eab 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and cpu_current_top_of_stack. This changes + * Reload esp0 and pcpu_hot.top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, + this_cpu_write(pcpu_hot.top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c4f6cacf6599d..7f807e8bc9237 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -618,7 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ raw_cpu_write(pcpu_hot.current_task, next_p); - this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05f315777691c..87863a93e9181 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1056,7 +1056,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 178015a820f08..7ac19aba89830 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -851,7 +851,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -869,7 +869,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(cpu_current_top_of_stack); + sp = this_cpu_read(pcpu_hot.top_of_stack); goto sync; } -- GitLab From d7b6d709a76a4f4ef3108ac41e1b39eb80f5c084 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:05 +0200 Subject: [PATCH 0085/1423] x86/percpu: Move irq_stack variables next to current_task Further extend struct pcpu_hot with the hard and soft irq stack pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.599170752@infradead.org --- arch/x86/include/asm/current.h | 6 ++++++ arch/x86/include/asm/irq_stack.h | 12 ++++++------ arch/x86/include/asm/processor.h | 4 ---- arch/x86/kernel/cpu/common.c | 3 --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/irq_32.c | 13 +++++-------- arch/x86/kernel/irq_64.c | 6 +++--- arch/x86/kernel/process_64.c | 2 +- 9 files changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 2dd013128f1ef..ac3090ddf34e8 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -18,6 +18,12 @@ struct pcpu_hot { int preempt_count; int cpu_number; unsigned long top_of_stack; + void *hardirq_stack_ptr; +#ifdef CONFIG_X86_64 + bool hardirq_stack_inuse; +#else + void *softirq_stack_ptr; +#endif }; u8 pad[64]; }; diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 147cb8fdda92e..798183867d789 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. \ */ \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(hardirq_stack_inuse, true); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(hardirq_stack_inuse, false); \ + __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c345f3096c80b..bdde68744eb3a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -448,8 +448,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ @@ -458,8 +456,6 @@ void current_save_fsgs(void); #ifdef CONFIG_STACKPROTECTOR DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ struct perf_event; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 408245c2eead8..2bec4b4b2c50d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2024,9 +2024,6 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - static void wrmsrl_cstar(unsigned long val) { /* diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 722fd712e1cf0..b4905d5173fd3 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6c5defd6569a3..f05339fee7785 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 01833ebf5e8e3..dc1049c01f9b9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -138,7 +135,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack_ptr); + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1c0fb96b9e390..fe0c859873d17 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(hardirq_stack_ptr, cpu)) + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7f807e8bc9237..1312de5b76aa4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -563,7 +563,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(hardirq_stack_inuse)); + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); -- GitLab From 7fcecafebed90d03f35bec6e147fc0b5f6e1bc71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:06 +0200 Subject: [PATCH 0086/1423] x86/softirq: Move softirq pending next to current task Another hot variable which is strict per CPU and benefits from being in the same cache line. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.702133710@infradead.org --- arch/x86/include/asm/current.h | 1 + arch/x86/include/asm/hardirq.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index ac3090ddf34e8..b89aba077b84b 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -19,6 +19,7 @@ struct pcpu_hot { int cpu_number; unsigned long top_of_stack; void *hardirq_stack_ptr; + u16 softirq_pending; #ifdef CONFIG_X86_64 bool hardirq_stack_inuse; #else diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310f..66837b8c67f1a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,9 +3,9 @@ #define _ASM_X86_HARDIRQ_H #include +#include typedef struct { - u16 __softirq_pending; #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif @@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +#define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) static inline void kvm_set_cpu_l1tf_flush_l1d(void) -- GitLab From 5b71ac8a2a3185da34a6556e791b533b48183a41 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 17 Oct 2022 16:41:06 +0200 Subject: [PATCH 0087/1423] x86: Fixup asm-offsets duplicate It turns out that 'stack_canary_offset' is a variable name; shadowing that with a #define is ripe of fail when the asm-offsets.h header gets included. Rename the thing. Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/asm-offsets_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9249a45cf53fd..5c578a7dfcd79 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary #endif /* diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9b698215d2618..bb65371ea9df5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,7 +57,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); + OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); BLANK(); #endif return 0; -- GitLab From 61c6065ef7ec0447a280179d04b2d81c80c2f479 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:07 +0200 Subject: [PATCH 0088/1423] objtool: Allow !PC relative relocations Objtool doesn't currently much like per-cpu usage in alternatives: arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xf: unsupported relocation in alternatives section f: 65 c7 04 25 00 00 00 00 00 00 00 80 movl $0x80000000,%gs:0x0 13: R_X86_64_32S __x86_call_depth Since the R_X86_64_32S relocation is location invariant (it's computation doesn't include P - the address of the location itself), it can be trivially allowed. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.806607235@infradead.org --- tools/objtool/arch/x86/decode.c | 24 ++++++++++++++++++++++++ tools/objtool/check.c | 2 +- tools/objtool/include/objtool/arch.h | 2 ++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1c253b4b7ce00..f0943830add75 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -73,6 +73,30 @@ unsigned long arch_jump_destination(struct instruction *insn) return insn->offset + insn->len + insn->immediate; } +bool arch_pc_relative_reloc(struct reloc *reloc) +{ + /* + * All relocation types where P (the address of the target) + * is included in the computation. + */ + switch (reloc->type) { + case R_X86_64_PC8: + case R_X86_64_PC16: + case R_X86_64_PC32: + case R_X86_64_PC64: + + case R_X86_64_PLT32: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPCREL: + return true; + + default: + break; + } + + return false; +} + #define ADD_OP(op) \ if (!(op = calloc(1, sizeof(*op)))) \ return -1; \ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 43ec14c29a60c..7174bba144940 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1645,7 +1645,7 @@ static int handle_group_alt(struct objtool_file *file, * accordingly. */ alt_reloc = insn_reloc(file, insn); - if (alt_reloc && + if (alt_reloc && arch_pc_relative_reloc(alt_reloc) && !arch_support_alt_relocation(special_alt, insn, alt_reloc)) { WARN_FUNC("unsupported relocation in alternatives section", diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index beb2f3aa94ffc..fe2ea4b892c34 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -93,4 +93,6 @@ bool arch_is_rethunk(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); +bool arch_pc_relative_reloc(struct reloc *reloc); + #endif /* _ARCH_H */ -- GitLab From 6644ee846cb983437063da8fd24b7cae671fd019 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:08 +0200 Subject: [PATCH 0089/1423] objtool: Track init section For future usage of .init.text exclusion track the init section in the instruction decoder and use the result in retpoline validation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111145.910334431@infradead.org --- tools/objtool/check.c | 17 ++++++++++------- tools/objtool/include/objtool/elf.h | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7174bba144940..bb7c8196bf57c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -382,6 +382,15 @@ static int decode_instructions(struct objtool_file *file) !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; + /* + * .init.text code is ran before userspace and thus doesn't + * strictly need retpolines, except for modules which are + * loaded late, they very much do need retpoline in their + * .init.text + */ + if (!strcmp(sec->name, ".init.text") && !opts.module) + sec->init = true; + for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { insn = malloc(sizeof(*insn)); if (!insn) { @@ -3748,13 +3757,7 @@ static int validate_retpoline(struct objtool_file *file) if (insn->retpoline_safe) continue; - /* - * .init.text code is ran before userspace and thus doesn't - * strictly need retpolines, except for modules which are - * loaded late, they very much do need retpoline in their - * .init.text - */ - if (!strcmp(insn->sec->name, ".init.text") && !opts.module) + if (insn->sec->init) continue; if (insn->type == INSN_RETURN) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 16f4067b82aea..baa808583c4fa 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,7 @@ struct section { Elf_Data *data; char *name; int idx; - bool changed, text, rodata, noinstr; + bool changed, text, rodata, noinstr, init; }; struct symbol { -- GitLab From 00abd38408127a57861698a8bffba65849de6bbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:09 +0200 Subject: [PATCH 0090/1423] objtool: Add .call_sites section In preparation for call depth tracking provide a section which collects all direct calls. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.016511961@infradead.org --- arch/x86/kernel/vmlinux.lds.S | 7 ++++ tools/objtool/check.c | 51 +++++++++++++++++++++++++ tools/objtool/include/objtool/objtool.h | 1 + tools/objtool/objtool.c | 1 + 4 files changed, 60 insertions(+) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0e9fc080c417d..b69df9e013cca 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,13 @@ SECTIONS *(.return_sites) __return_sites_end = .; } + + . = ALIGN(8); + .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) { + __call_sites = .; + *(.call_sites) + __call_sites_end = .; + } #endif #ifdef CONFIG_X86_KERNEL_IBT diff --git a/tools/objtool/check.c b/tools/objtool/check.c index bb7c8196bf57c..f578e030e8bb9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -902,6 +902,49 @@ static int create_mcount_loc_sections(struct objtool_file *file) return 0; } +static int create_direct_call_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + unsigned int *loc; + int idx; + + sec = find_section_by_name(file->elf, ".call_sites"); + if (sec) { + INIT_LIST_HEAD(&file->call_list); + WARN("file already has .call_sites section, skipping"); + return 0; + } + + if (list_empty(&file->call_list)) + return 0; + + idx = 0; + list_for_each_entry(insn, &file->call_list, call_node) + idx++; + + sec = elf_create_section(file->elf, ".call_sites", 0, sizeof(unsigned int), idx); + if (!sec) + return -1; + + idx = 0; + list_for_each_entry(insn, &file->call_list, call_node) { + + loc = (unsigned int *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned int)); + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned int), + R_X86_64_PC32, + insn->sec, insn->offset)) + return -1; + + idx++; + } + + return 0; +} + /* * Warnings shouldn't be reported for ignored functions. */ @@ -1279,6 +1322,9 @@ static void annotate_call_site(struct objtool_file *file, return; } + if (insn->type == INSN_CALL && !insn->sec->init) + list_add_tail(&insn->call_node, &file->call_list); + if (!sibling && dead_end_function(file, sym)) insn->dead_end = true; } @@ -4305,6 +4351,11 @@ int check(struct objtool_file *file) if (ret < 0) goto out; warnings += ret; + + ret = create_direct_call_sections(file); + if (ret < 0) + goto out; + warnings += ret; } if (opts.mcount) { diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 7f2d1b0953333..6b40977bcdb1a 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -28,6 +28,7 @@ struct objtool_file { struct list_head static_call_list; struct list_head mcount_loc_list; struct list_head endbr_list; + struct list_head call_list; bool ignore_unreachables, hints, rodata; unsigned int nr_endbr; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index a7ecc32e35125..6affd8067f837 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -106,6 +106,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); INIT_LIST_HEAD(&file.endbr_list); + INIT_LIST_HEAD(&file.call_list); file.ignore_unreachables = opts.no_unreachable; file.hints = false; -- GitLab From 0c0a6d8934e2081df93ba0bfc0cf615cc9c06988 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:10 +0200 Subject: [PATCH 0091/1423] objtool: Add --hacks=skylake Make the call/func sections selectable via the --hacks option. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.120821440@infradead.org --- scripts/Makefile.lib | 1 + tools/objtool/builtin-check.c | 7 ++++++- tools/objtool/check.c | 10 ++++++---- tools/objtool/include/objtool/builtin.h | 1 + 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3aa384cec76b8..85f02756dc9c1 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -254,6 +254,7 @@ objtool := $(objtree)/tools/objtool/objtool objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr +objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount objtool-args-$(CONFIG_UNWINDER_ORC) += --orc diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 24fbe803a0d32..0a04f8ea4432d 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -57,12 +57,17 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) found = true; } + if (!str || strstr(str, "skylake")) { + opts.hack_skylake = true; + found = true; + } + return found ? 0 : -1; } const struct option check_options[] = { OPT_GROUP("Actions:"), - OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr", "patch toolchain bugs/limitations", parse_hacks), + OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f578e030e8bb9..1461c8894fb70 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4352,10 +4352,12 @@ int check(struct objtool_file *file) goto out; warnings += ret; - ret = create_direct_call_sections(file); - if (ret < 0) - goto out; - warnings += ret; + if (opts.hack_skylake) { + ret = create_direct_call_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } } if (opts.mcount) { diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 42a52f1a0add5..22092a9f3cf6c 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -14,6 +14,7 @@ struct opts { bool dump_orc; bool hack_jump_label; bool hack_noinstr; + bool hack_skylake; bool ibt; bool mcount; bool noinstr; -- GitLab From 5da6aea375cde499fdfac3cde4f26df4a840eb9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:12 +0200 Subject: [PATCH 0092/1423] objtool: Fix find_{symbol,func}_containing() The current find_{symbol,func}_containing() functions are broken in the face of overlapping symbols, exactly the case that is needed for a new ibt/endbr supression. Import interval_tree_generic.h into the tools tree and convert the symbol tree to an interval tree to support proper range stabs. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.330203761@infradead.org --- tools/include/linux/interval_tree_generic.h | 187 ++++++++++++++++++++ tools/objtool/elf.c | 93 +++++----- tools/objtool/include/objtool/elf.h | 3 +- 3 files changed, 229 insertions(+), 54 deletions(-) create mode 100644 tools/include/linux/interval_tree_generic.h diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h new file mode 100644 index 0000000000000..aaa8a0767aa3a --- /dev/null +++ b/tools/include/linux/interval_tree_generic.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + + include/linux/interval_tree_generic.h +*/ + +#include + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + * + * Note - before using this, please consider if generic version + * (interval_tree.h) would work for you... + */ + +#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \ + ITSTART, ITLAST, ITSTATIC, ITPREFIX) \ + \ +/* Callbacks for augmented rbtree insert and remove */ \ + \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ + \ +/* Insert / remove interval nodes from the tree */ \ + \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, \ + struct rb_root_cached *root) \ +{ \ + struct rb_node **link = &root->rb_root.rb_node, *rb_parent = NULL; \ + ITTYPE start = ITSTART(node), last = ITLAST(node); \ + ITSTRUCT *parent; \ + bool leftmost = true; \ + \ + while (*link) { \ + rb_parent = *link; \ + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \ + if (parent->ITSUBTREE < last) \ + parent->ITSUBTREE = last; \ + if (start < ITSTART(parent)) \ + link = &parent->ITRB.rb_left; \ + else { \ + link = &parent->ITRB.rb_right; \ + leftmost = false; \ + } \ + } \ + \ + node->ITSUBTREE = last; \ + rb_link_node(&node->ITRB, rb_parent, link); \ + rb_insert_augmented_cached(&node->ITRB, root, \ + leftmost, &ITPREFIX ## _augment); \ +} \ + \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ + struct rb_root_cached *root) \ +{ \ + rb_erase_augmented_cached(&node->ITRB, root, &ITPREFIX ## _augment); \ +} \ + \ +/* \ + * Iterate over intervals intersecting [start;last] \ + * \ + * Note that a node's interval intersects [start;last] iff: \ + * Cond1: ITSTART(node) <= last \ + * and \ + * Cond2: start <= ITLAST(node) \ + */ \ + \ +static ITSTRUCT * \ +ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + while (true) { \ + /* \ + * Loop invariant: start <= node->ITSUBTREE \ + * (Cond2 is satisfied by one of the subtree nodes) \ + */ \ + if (node->ITRB.rb_left) { \ + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \ + ITSTRUCT, ITRB); \ + if (start <= left->ITSUBTREE) { \ + /* \ + * Some nodes in left subtree satisfy Cond2. \ + * Iterate to find the leftmost such node N. \ + * If it also satisfies Cond1, that's the \ + * match we are looking for. Otherwise, there \ + * is no matching interval as nodes to the \ + * right of N can't satisfy Cond1 either. \ + */ \ + node = left; \ + continue; \ + } \ + } \ + if (ITSTART(node) <= last) { /* Cond1 */ \ + if (start <= ITLAST(node)) /* Cond2 */ \ + return node; /* node is leftmost match */ \ + if (node->ITRB.rb_right) { \ + node = rb_entry(node->ITRB.rb_right, \ + ITSTRUCT, ITRB); \ + if (start <= node->ITSUBTREE) \ + continue; \ + } \ + } \ + return NULL; /* No match */ \ + } \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_first(struct rb_root_cached *root, \ + ITTYPE start, ITTYPE last) \ +{ \ + ITSTRUCT *node, *leftmost; \ + \ + if (!root->rb_root.rb_node) \ + return NULL; \ + \ + /* \ + * Fastpath range intersection/overlap between A: [a0, a1] and \ + * B: [b0, b1] is given by: \ + * \ + * a0 <= b1 && b0 <= a1 \ + * \ + * ... where A holds the lock range and B holds the smallest \ + * 'start' and largest 'last' in the tree. For the later, we \ + * rely on the root node, which by augmented interval tree \ + * property, holds the largest value in its last-in-subtree. \ + * This allows mitigating some of the tree walk overhead for \ + * for non-intersecting ranges, maintained and consulted in O(1). \ + */ \ + node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB); \ + if (node->ITSUBTREE < start) \ + return NULL; \ + \ + leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB); \ + if (ITSTART(leftmost) > last) \ + return NULL; \ + \ + return ITPREFIX ## _subtree_search(node, start, last); \ +} \ + \ +ITSTATIC ITSTRUCT * \ +ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ +{ \ + struct rb_node *rb = node->ITRB.rb_right, *prev; \ + \ + while (true) { \ + /* \ + * Loop invariants: \ + * Cond1: ITSTART(node) <= last \ + * rb == node->ITRB.rb_right \ + * \ + * First, search right subtree if suitable \ + */ \ + if (rb) { \ + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \ + if (start <= right->ITSUBTREE) \ + return ITPREFIX ## _subtree_search(right, \ + start, last); \ + } \ + \ + /* Move up the tree until we come from a node's left child */ \ + do { \ + rb = rb_parent(&node->ITRB); \ + if (!rb) \ + return NULL; \ + prev = &node->ITRB; \ + node = rb_entry(rb, ITSTRUCT, ITRB); \ + rb = node->ITRB.rb_right; \ + } while (prev == rb); \ + \ + /* Check if the node intersects [start;last] */ \ + if (last < ITSTART(node)) /* !Cond1 */ \ + return NULL; \ + else if (start <= ITLAST(node)) /* Cond2 */ \ + return node; \ + } \ +} diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163a..89b37cd4ab1dc 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -50,38 +51,22 @@ static inline u32 str_hash(const char *str) __elf_table(name); \ }) -static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b) +static inline unsigned long __sym_start(struct symbol *s) { - struct symbol *sa = rb_entry(a, struct symbol, node); - struct symbol *sb = rb_entry(b, struct symbol, node); - - if (sa->offset < sb->offset) - return true; - if (sa->offset > sb->offset) - return false; - - if (sa->len < sb->len) - return true; - if (sa->len > sb->len) - return false; - - sa->alias = sb; - - return false; + return s->offset; } -static int symbol_by_offset(const void *key, const struct rb_node *node) +static inline unsigned long __sym_last(struct symbol *s) { - const struct symbol *s = rb_entry(node, struct symbol, node); - const unsigned long *o = key; + return s->offset + s->len - 1; +} - if (*o < s->offset) - return -1; - if (*o >= s->offset + s->len) - return 1; +INTERVAL_TREE_DEFINE(struct symbol, node, unsigned long, __subtree_last, + __sym_start, __sym_last, static, __sym) - return 0; -} +#define __sym_for_each(_iter, _tree, _start, _end) \ + for (_iter = __sym_iter_first((_tree), (_start), (_end)); \ + _iter; _iter = __sym_iter_next(_iter, (_start), (_end))) struct symbol_hole { unsigned long key; @@ -147,13 +132,12 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->offset == offset && s->type != STT_SECTION) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->offset == offset && iter->type != STT_SECTION) + return iter; } return NULL; @@ -161,13 +145,12 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { - struct rb_node *node; + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); - - if (s->offset == offset && s->type == STT_FUNC) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->offset == offset && iter->type == STT_FUNC) + return iter; } return NULL; @@ -175,13 +158,12 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->type != STT_SECTION) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->type != STT_SECTION) + return iter; } return NULL; @@ -202,7 +184,7 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) /* * Find the rightmost symbol for which @offset is after it. */ - n = rb_find(&hole, &sec->symbol_tree, symbol_hole_by_offset); + n = rb_find(&hole, &sec->symbol_tree.rb_root, symbol_hole_by_offset); /* found a symbol that contains @offset */ if (n) @@ -224,13 +206,12 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) struct symbol *find_func_containing(struct section *sec, unsigned long offset) { - struct rb_node *node; - - rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { - struct symbol *s = rb_entry(node, struct symbol, node); + struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; + struct symbol *iter; - if (s->type == STT_FUNC) - return s; + __sym_for_each(iter, tree, offset, offset) { + if (iter->type == STT_FUNC) + return iter; } return NULL; @@ -373,6 +354,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) { struct list_head *entry; struct rb_node *pnode; + struct symbol *iter; INIT_LIST_HEAD(&sym->pv_target); sym->alias = sym; @@ -386,7 +368,12 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; - rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); + __sym_for_each(iter, &sym->sec->symbol_tree, sym->offset, sym->offset) { + if (iter->offset == sym->offset && iter->type == sym->type) + iter->alias = sym; + } + + __sym_insert(sym, &sym->sec->symbol_tree); pnode = rb_prev(&sym->node); if (pnode) entry = &rb_entry(pnode, struct symbol, node)->list; @@ -401,7 +388,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) * can exist within a function, confusing the sorting. */ if (!sym->len) - rb_erase(&sym->node, &sym->sec->symbol_tree); + __sym_remove(sym, &sym->sec->symbol_tree); } static int read_symbols(struct elf *elf) diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index baa808583c4fa..d28533106b780 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -30,7 +30,7 @@ struct section { struct hlist_node hash; struct hlist_node name_hash; GElf_Shdr sh; - struct rb_root symbol_tree; + struct rb_root_cached symbol_tree; struct list_head symbol_list; struct list_head reloc_list; struct section *base, *reloc; @@ -53,6 +53,7 @@ struct symbol { unsigned char bind, type; unsigned long offset; unsigned int len; + unsigned long __subtree_last; struct symbol *pfunc, *cfunc, *alias; u8 uaccess_safe : 1; u8 static_call_tramp : 1; -- GitLab From 08ef8c40112b8cd157515cd532f65cb82c934a76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:13 +0200 Subject: [PATCH 0093/1423] objtool: Allow symbol range comparisons for IBT/ENDBR A semi common pattern is where code checks if a code address is within a specific range. All text addresses require either ENDBR or ANNOTATE_ENDBR, however the ANNOTATE_NOENDBR past the range is unnatural. Instead, suppress this warning when this is exactly at the end of a symbol that itself starts with either ENDBR/ANNOTATE_ENDBR. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.434642471@infradead.org --- arch/x86/entry/entry_64_compat.S | 1 - tools/objtool/check.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 1dfee868d4a10..bc45ea7d08ee7 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -128,7 +128,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) - ANNOTATE_NOENDBR // is_sysenter_singlestep SYM_CODE_END(entry_SYSENTER_compat) /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 1461c8894fb70..3f46f46ab85c3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4033,6 +4033,24 @@ static void mark_endbr_used(struct instruction *insn) list_del_init(&insn->call_node); } +static bool noendbr_range(struct objtool_file *file, struct instruction *insn) +{ + struct symbol *sym = find_symbol_containing(insn->sec, insn->offset-1); + struct instruction *first; + + if (!sym) + return false; + + first = find_insn(file, sym->sec, sym->offset); + if (!first) + return false; + + if (first->type != INSN_ENDBR && !first->noendbr) + return false; + + return insn->offset == sym->offset + sym->len; +} + static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn) { struct instruction *dest; @@ -4105,9 +4123,19 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn continue; } + /* + * Accept anything ANNOTATE_NOENDBR. + */ if (dest->noendbr) continue; + /* + * Accept if this is the instruction after a symbol + * that is (no)endbr -- typical code-range usage. + */ + if (noendbr_range(file, dest)) + continue; + WARN_FUNC("relocation to !ENDBR: %s", insn->sec, insn->offset, offstr(dest->sec, dest->offset)); -- GitLab From dbcdbdfdf137b49144204571f1a5e5dc01b8aaad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Sep 2022 22:03:50 +0200 Subject: [PATCH 0094/1423] objtool: Rework instruction -> symbol mapping Currently insn->func contains a instruction -> symbol link for STT_FUNC symbols. A NULL value is assumed to mean STT_NOTYPE. However, there are also instructions not covered by any symbol at all. This can happen due to __weak symbols for example. Since the current scheme cannot differentiate between no symbol and STT_NOTYPE symbol, change things around. Make insn->sym point to any symbol type such that !insn->sym means no symbol and add a helper insn_func() that check the sym->type to retain the old functionality. This then prepares the way to add code that depends on the distinction between STT_NOTYPE and no symbol at all. Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 105 ++++++++++++++------------ tools/objtool/include/objtool/check.h | 12 ++- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f46f46ab85c3..e532efb9b5ab3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -62,12 +62,12 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, struct instruction *insn) { struct instruction *next = list_next_entry(insn, list); - struct symbol *func = insn->func; + struct symbol *func = insn_func(insn); if (!func) return NULL; - if (&next->list != &file->insn_list && next->func == func) + if (&next->list != &file->insn_list && insn_func(next) == func) return next; /* Check if we're already in the subfunction: */ @@ -83,7 +83,7 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, { struct instruction *prev = list_prev_entry(insn, list); - if (&prev->list != &file->insn_list && prev->func == insn->func) + if (&prev->list != &file->insn_list && insn_func(prev) == insn_func(insn)) return prev; return NULL; @@ -133,7 +133,7 @@ static bool is_sibling_call(struct instruction *insn) * sibling call detection consistency between vmlinux.o and individual * objects. */ - if (!insn->func) + if (!insn_func(insn)) return false; /* An indirect jump is either a sibling call or a jump to a table. */ @@ -207,7 +207,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return false; insn = find_insn(file, func->sec, func->offset); - if (!insn->func) + if (!insn_func(insn)) return false; func_for_each_insn(file, func, insn) { @@ -243,7 +243,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return false; } - return __dead_end_function(file, dest->func, recursion+1); + return __dead_end_function(file, insn_func(dest), recursion+1); } } @@ -427,7 +427,10 @@ static int decode_instructions(struct objtool_file *file) } list_for_each_entry(func, &sec->symbol_list, list) { - if (func->type != STT_FUNC || func->alias != func) + if (func->type != STT_NOTYPE && func->type != STT_FUNC) + continue; + + if (func->return_thunk || func->alias != func) continue; if (!find_insn(file, sec, func->offset)) { @@ -437,9 +440,11 @@ static int decode_instructions(struct objtool_file *file) } sym_for_each_insn(file, func, insn) { - insn->func = func; - if (insn->type == INSN_ENDBR && list_empty(&insn->call_node)) { - if (insn->offset == insn->func->offset) { + insn->sym = func; + if (func->type == STT_FUNC && + insn->type == INSN_ENDBR && + list_empty(&insn->call_node)) { + if (insn->offset == func->offset) { list_add_tail(&insn->call_node, &file->endbr_list); file->nr_endbr++; } else { @@ -1397,19 +1402,19 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, static bool same_function(struct instruction *insn1, struct instruction *insn2) { - return insn1->func->pfunc == insn2->func->pfunc; + return insn_func(insn1)->pfunc == insn_func(insn2)->pfunc; } static bool is_first_func_insn(struct objtool_file *file, struct instruction *insn) { - if (insn->offset == insn->func->offset) + if (insn->offset == insn_func(insn)->offset) return true; if (opts.ibt) { struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == insn->func->offset + prev->len) + insn->offset == insn_func(insn)->offset + prev->len) return true; } @@ -1450,7 +1455,7 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->return_thunk) { add_return_call(file, insn, true); continue; - } else if (insn->func) { + } else if (insn_func(insn)) { /* * External sibling call or internal sibling call with * STT_FUNC reloc. @@ -1492,8 +1497,8 @@ static int add_jump_destinations(struct objtool_file *file) /* * Cross-function jump. */ - if (insn->func && jump_dest->func && - insn->func != jump_dest->func) { + if (insn_func(insn) && insn_func(jump_dest) && + insn_func(insn) != insn_func(jump_dest)) { /* * For GCC 8+, create parent/child links for any cold @@ -1510,10 +1515,10 @@ static int add_jump_destinations(struct objtool_file *file) * case where the parent function's only reference to a * subfunction is through a jump table. */ - if (!strstr(insn->func->name, ".cold") && - strstr(jump_dest->func->name, ".cold")) { - insn->func->cfunc = jump_dest->func; - jump_dest->func->pfunc = insn->func; + if (!strstr(insn_func(insn)->name, ".cold") && + strstr(insn_func(jump_dest)->name, ".cold")) { + insn_func(insn)->cfunc = insn_func(jump_dest); + insn_func(jump_dest)->pfunc = insn_func(insn); } else if (!same_function(insn, jump_dest) && is_first_func_insn(file, jump_dest)) { @@ -1521,7 +1526,7 @@ static int add_jump_destinations(struct objtool_file *file) * Internal sibling call without reloc or with * STT_SECTION reloc. */ - add_call_dest(file, insn, jump_dest->func, true); + add_call_dest(file, insn, insn_func(jump_dest), true); continue; } } @@ -1572,7 +1577,7 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - if (insn->func && insn->call_dest->type != STT_FUNC) { + if (insn_func(insn) && insn->call_dest->type != STT_FUNC) { WARN_FUNC("unsupported call to non-function", insn->sec, insn->offset); return -1; @@ -1668,7 +1673,7 @@ static int handle_group_alt(struct objtool_file *file, nop->offset = special_alt->new_off + special_alt->new_len; nop->len = special_alt->orig_len - special_alt->new_len; nop->type = INSN_NOP; - nop->func = orig_insn->func; + nop->sym = orig_insn->sym; nop->alt_group = new_alt_group; nop->ignore = orig_insn->ignore_alts; } @@ -1688,7 +1693,7 @@ static int handle_group_alt(struct objtool_file *file, last_new_insn = insn; insn->ignore = orig_insn->ignore_alts; - insn->func = orig_insn->func; + insn->sym = orig_insn->sym; insn->alt_group = new_alt_group; /* @@ -1882,7 +1887,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *reloc = table; struct instruction *dest_insn; struct alternative *alt; - struct symbol *pfunc = insn->func->pfunc; + struct symbol *pfunc = insn_func(insn)->pfunc; unsigned int prev_offset = 0; /* @@ -1909,7 +1914,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, break; /* Make sure the destination is in the same function: */ - if (!dest_insn->func || dest_insn->func->pfunc != pfunc) + if (!insn_func(dest_insn) || insn_func(dest_insn)->pfunc != pfunc) break; alt = malloc(sizeof(*alt)); @@ -1949,7 +1954,7 @@ static struct reloc *find_jump_table(struct objtool_file *file, * it. */ for (; - insn && insn->func && insn->func->pfunc == func; + insn && insn_func(insn) && insn_func(insn)->pfunc == func; insn = insn->first_jump_src ?: prev_insn_same_sym(file, insn)) { if (insn != orig_insn && insn->type == INSN_JUMP_DYNAMIC) @@ -1966,7 +1971,7 @@ static struct reloc *find_jump_table(struct objtool_file *file, if (!table_reloc) continue; dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend); - if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) + if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; return table_reloc; @@ -2415,6 +2420,13 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_{jump_call}_destination. + */ + ret = classify_symbols(file); + if (ret) + return ret; + ret = decode_instructions(file); if (ret) return ret; @@ -2433,13 +2445,6 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be before add_{jump_call}_destination. - */ - ret = classify_symbols(file); - if (ret) - return ret; - /* * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. @@ -2648,7 +2653,7 @@ static int update_cfi_state(struct instruction *insn, /* stack operations don't make sense with an undefined CFA */ if (cfa->base == CFI_UNDEFINED) { - if (insn->func) { + if (insn_func(insn)) { WARN_FUNC("undefined stack state", insn->sec, insn->offset); return -1; } @@ -2994,7 +2999,7 @@ static int update_cfi_state(struct instruction *insn, } /* detect when asm code uses rbp as a scratch register */ - if (opts.stackval && insn->func && op->src.reg == CFI_BP && + if (opts.stackval && insn_func(insn) && op->src.reg == CFI_BP && cfa->base != CFI_BP) cfi->bp_scratch = true; break; @@ -3390,13 +3395,13 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, while (1) { next_insn = next_insn_to_validate(file, insn); - if (func && insn->func && func != insn->func->pfunc) { + if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6)) return 0; WARN("%s() falls through to next function %s()", - func->name, insn->func->name); + func->name, insn_func(insn)->name); return 1; } @@ -3638,7 +3643,7 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) while (&insn->list != &file->insn_list && (!sec || insn->sec == sec)) { if (insn->hint && !insn->visited && !insn->ignore) { - ret = validate_branch(file, insn->func, insn, state); + ret = validate_branch(file, insn_func(insn), insn, state); if (ret && opts.backtrace) BT_FUNC("<=== (hint)", insn); warnings += ret; @@ -3861,7 +3866,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio * In this case we'll find a piece of code (whole function) that is not * covered by a !section symbol. Ignore them. */ - if (opts.link && !insn->func) { + if (opts.link && !insn_func(insn)) { int size = find_symbol_hole_containing(insn->sec, insn->offset); unsigned long end = insn->offset + size; @@ -3885,10 +3890,10 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio /* * If this hole jumps to a .cold function, mark it ignore too. */ - if (insn->jump_dest && insn->jump_dest->func && - strstr(insn->jump_dest->func->name, ".cold")) { + if (insn->jump_dest && insn_func(insn->jump_dest) && + strstr(insn_func(insn->jump_dest)->name, ".cold")) { struct instruction *dest = insn->jump_dest; - func_for_each_insn(file, dest->func, dest) + func_for_each_insn(file, insn_func(dest), dest) dest->ignore = true; } } @@ -3896,10 +3901,10 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } - if (!insn->func) + if (!insn_func(insn)) return false; - if (insn->func->static_call_tramp) + if (insn_func(insn)->static_call_tramp) return true; /* @@ -3930,7 +3935,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio if (insn->type == INSN_JUMP_UNCONDITIONAL) { if (insn->jump_dest && - insn->jump_dest->func == insn->func) { + insn_func(insn->jump_dest) == insn_func(insn)) { insn = insn->jump_dest; continue; } @@ -3938,7 +3943,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio break; } - if (insn->offset + insn->len >= insn->func->offset + insn->func->len) + if (insn->offset + insn->len >= insn_func(insn)->offset + insn_func(insn)->len) break; insn = list_next_entry(insn, list); @@ -3967,7 +3972,7 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, state->uaccess = sym->uaccess_safe; - ret = validate_branch(file, insn->func, insn, *state); + ret = validate_branch(file, insn_func(insn), insn, *state); if (ret && opts.backtrace) BT_FUNC("<=== (sym)", insn); return ret; @@ -4104,7 +4109,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn continue; } - if (dest->func && dest->func == insn->func) { + if (insn_func(dest) && insn_func(dest) == insn_func(insn)) { /* * Anything from->to self is either _THIS_IP_ or * IRET-to-self. diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 036129cebeeef..acd7fae593484 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -67,11 +67,21 @@ struct instruction { struct reloc *jump_table; struct reloc *reloc; struct list_head alts; - struct symbol *func; + struct symbol *sym; struct list_head stack_ops; struct cfi_state *cfi; }; +static inline struct symbol *insn_func(struct instruction *insn) +{ + struct symbol *sym = insn->sym; + + if (sym && sym->type != STT_FUNC) + sym = NULL; + + return sym; +} + #define VISITED_BRANCH 0x01 #define VISITED_BRANCH_UACCESS 0x02 #define VISITED_BRANCH_MASK 0x03 -- GitLab From 5a9c361a416fc3a3301e859ff09587cc1b933eb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jul 2022 11:49:50 +0200 Subject: [PATCH 0095/1423] objtool: Allow STT_NOTYPE -> STT_FUNC+0 sibling-calls Teach objtool about STT_NOTYPE -> STT_FUNC+0 sibling calls. Doing do allows slightly simpler .S files. There is a slight complication in that we specifically do not want to allow sibling calls from symbol holes (previously covered by STT_WEAK symbols) -- such things exist where a weak function has a .cold subfunction for example. Additionally, STT_NOTYPE tail-calls are allowed to happen with a modified stack frame, they don't need to obey the normal rules after all. Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 74 +++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e532efb9b5ab3..7936312e10c79 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -129,16 +129,13 @@ static bool is_jump_table_jump(struct instruction *insn) static bool is_sibling_call(struct instruction *insn) { /* - * Assume only ELF functions can make sibling calls. This ensures - * sibling call detection consistency between vmlinux.o and individual - * objects. + * Assume only STT_FUNC calls have jump-tables. */ - if (!insn_func(insn)) - return false; - - /* An indirect jump is either a sibling call or a jump to a table. */ - if (insn->type == INSN_JUMP_DYNAMIC) - return !is_jump_table_jump(insn); + if (insn_func(insn)) { + /* An indirect jump is either a sibling call or a jump to a table. */ + if (insn->type == INSN_JUMP_DYNAMIC) + return !is_jump_table_jump(insn); + } /* add_jump_destinations() sets insn->call_dest for sibling calls. */ return (is_static_jump(insn) && insn->call_dest); @@ -1400,27 +1397,50 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, list_add_tail(&insn->call_node, &file->return_thunk_list); } -static bool same_function(struct instruction *insn1, struct instruction *insn2) -{ - return insn_func(insn1)->pfunc == insn_func(insn2)->pfunc; -} - -static bool is_first_func_insn(struct objtool_file *file, struct instruction *insn) +static bool is_first_func_insn(struct objtool_file *file, + struct instruction *insn, struct symbol *sym) { - if (insn->offset == insn_func(insn)->offset) + if (insn->offset == sym->offset) return true; + /* Allow direct CALL/JMP past ENDBR */ if (opts.ibt) { struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == insn_func(insn)->offset + prev->len) + insn->offset == sym->offset + prev->len) return true; } return false; } +/* + * A sibling call is a tail-call to another symbol -- to differentiate from a + * recursive tail-call which is to the same symbol. + */ +static bool jump_is_sibling_call(struct objtool_file *file, + struct instruction *from, struct instruction *to) +{ + struct symbol *fs = from->sym; + struct symbol *ts = to->sym; + + /* Not a sibling call if from/to a symbol hole */ + if (!fs || !ts) + return false; + + /* Not a sibling call if not targeting the start of a symbol. */ + if (!is_first_func_insn(file, to, ts)) + return false; + + /* Disallow sibling calls into STT_NOTYPE */ + if (ts->type == STT_NOTYPE) + return false; + + /* Must not be self to be a sibling */ + return fs->pfunc != ts->pfunc; +} + /* * Find the destination instructions for all jumps. */ @@ -1519,18 +1539,18 @@ static int add_jump_destinations(struct objtool_file *file) strstr(insn_func(jump_dest)->name, ".cold")) { insn_func(insn)->cfunc = insn_func(jump_dest); insn_func(jump_dest)->pfunc = insn_func(insn); - - } else if (!same_function(insn, jump_dest) && - is_first_func_insn(file, jump_dest)) { - /* - * Internal sibling call without reloc or with - * STT_SECTION reloc. - */ - add_call_dest(file, insn, insn_func(jump_dest), true); - continue; } } + if (jump_is_sibling_call(file, insn, jump_dest)) { + /* + * Internal sibling call without reloc or with + * STT_SECTION reloc. + */ + add_call_dest(file, insn, insn_func(jump_dest), true); + continue; + } + insn->jump_dest = jump_dest; } @@ -3309,7 +3329,7 @@ static int validate_sibling_call(struct objtool_file *file, struct instruction *insn, struct insn_state *state) { - if (has_modified_stack_frame(insn, state)) { + if (insn_func(insn) && has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", insn->sec, insn->offset); return 1; -- GitLab From ef79ed20e3ae9ee9ac2e0f3a4e12814893972e63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:14 +0200 Subject: [PATCH 0096/1423] x86/entry: Make sync_regs() invocation a tail call No point in having a call there. Spare the call/ret overhead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.539578813@infradead.org --- arch/x86/entry/entry_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c578a7dfcd79..b24b84b3425fb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1062,11 +1062,8 @@ SYM_CODE_START_LOCAL(error_entry) UNTRAIN_RET leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ -.Lerror_entry_from_usermode_after_swapgs: - /* Put us onto the real thread stack. */ - call sync_regs - RET + jmp sync_regs /* * There are two places in the kernel that can potentially fault with @@ -1124,7 +1121,7 @@ SYM_CODE_START_LOCAL(error_entry) leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret mov %rax, %rdi - jmp .Lerror_entry_from_usermode_after_swapgs + jmp sync_regs SYM_CODE_END(error_entry) SYM_CODE_START_LOCAL(error_return) -- GitLab From cb855971d717a2dd752241f66fedad9dc178388c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:16 +0200 Subject: [PATCH 0097/1423] x86/putuser: Provide room for padding Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.746429822@infradead.org --- arch/x86/lib/putuser.S | 62 +++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index b7dfd60243b75..32125224fccaa 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -47,8 +47,6 @@ SYM_FUNC_START(__put_user_1) LOAD_TASK_SIZE_MINUS_N(0) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) - ENDBR ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -56,54 +54,87 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) RET SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) + +SYM_FUNC_START(__put_user_nocheck_1) + ENDBR + ASM_STAC +2: movb %al,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) LOAD_TASK_SIZE_MINUS_N(1) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) - ENDBR ASM_STAC -2: movw %ax,(%_ASM_CX) +3: movw %ax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) + +SYM_FUNC_START(__put_user_nocheck_2) + ENDBR + ASM_STAC +4: movw %ax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) LOAD_TASK_SIZE_MINUS_N(3) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) - ENDBR ASM_STAC -3: movl %eax,(%_ASM_CX) +5: movl %eax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) + +SYM_FUNC_START(__put_user_nocheck_4) + ENDBR + ASM_STAC +6: movl %eax,(%_ASM_CX) + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_BX,%_ASM_CX jae .Lbad_put_user -SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL) - ENDBR ASM_STAC -4: mov %_ASM_AX,(%_ASM_CX) +7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 -5: movl %edx,4(%_ASM_CX) +8: movl %edx,4(%_ASM_CX) #endif xor %ecx,%ecx ASM_CLAC RET SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) + +SYM_FUNC_START(__put_user_nocheck_8) + ENDBR + ASM_STAC +9: mov %_ASM_AX,(%_ASM_CX) +#ifdef CONFIG_X86_32 +10: movl %edx,4(%_ASM_CX) +#endif + xor %ecx,%ecx + ASM_CLAC + RET +SYM_FUNC_END(__put_user_nocheck_8) EXPORT_SYMBOL(__put_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_put_user_clac) @@ -117,6 +148,11 @@ SYM_CODE_END(.Lbad_put_user_clac) _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) -#ifdef CONFIG_X86_32 _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) +#ifdef CONFIG_X86_32 + _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) #endif -- GitLab From 8f7c0d8b23c3f5f740a48db31ebadef28af17a22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:17 +0200 Subject: [PATCH 0098/1423] x86/Kconfig: Add CONFIG_CALL_THUNKS In preparation for mitigating the Intel SKL RSB underflow issue in software, add a new configuration symbol which allows to build the required call thunk infrastructure conditionally. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.849523555@infradead.org --- arch/x86/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f408fa87ed947..e18963e77cb1d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2446,6 +2446,14 @@ config CC_HAS_SLS config CC_HAS_RETURN_THUNK def_bool $(cc-option,-mfunction-return=thunk-extern) +config HAVE_CALL_THUNKS + def_bool y + depends on RETHUNK && OBJTOOL + +config CALL_THUNKS + def_bool n + select FUNCTION_ALIGNMENT_16B + menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" default y -- GitLab From bea75b33895f7f87f0c40023e36a2d087e87ffa1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:18 +0200 Subject: [PATCH 0099/1423] x86/Kconfig: Introduce function padding Now that all functions are 16 byte aligned, add 16 bytes of NOP padding in front of each function. This prepares things for software call stack tracking and kCFI/FineIBT. This significantly increases kernel .text size, around 5.1% on a x86_64-defconfig-ish build. However, per the random access argument used for alignment, these 16 extra bytes are code that wouldn't be used. Performance measurements back this up by showing no significant performance regressions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111146.950884492@infradead.org --- arch/x86/Kconfig | 20 ++++++++++++- arch/x86/Makefile | 6 ++++ arch/x86/entry/vdso/Makefile | 3 +- arch/x86/include/asm/linkage.h | 51 ++++++++++++++++++++++++++++++++-- include/linux/bpf.h | 4 +++ 5 files changed, 79 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e18963e77cb1d..e368fc0daa4ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2446,9 +2446,27 @@ config CC_HAS_SLS config CC_HAS_RETURN_THUNK def_bool $(cc-option,-mfunction-return=thunk-extern) +config CC_HAS_ENTRY_PADDING + def_bool $(cc-option,-fpatchable-function-entry=16,16) + +config FUNCTION_PADDING_CFI + int + default 59 if FUNCTION_ALIGNMENT_64B + default 27 if FUNCTION_ALIGNMENT_32B + default 11 if FUNCTION_ALIGNMENT_16B + default 3 if FUNCTION_ALIGNMENT_8B + default 0 + +# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# except Kconfig can't do arithmetic :/ +config FUNCTION_PADDING_BYTES + int + default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_ALIGNMENT + config HAVE_CALL_THUNKS def_bool y - depends on RETHUNK && OBJTOOL + depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL config CALL_THUNKS def_bool n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 415a5d138de47..1640e005092b9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,6 +208,12 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif +ifdef CONFIG_CALL_THUNKS +PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) +KBUILD_CFLAGS += $(PADDING_CFLAGS) +export PADDING_CFLAGS +endif + KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3ef611044c8f6..838613ac15b82 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -95,7 +95,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -158,6 +158,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index c2d6e2733b111..45e0df850645c 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -15,8 +15,19 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#define ASM_FUNC_ALIGN __ALIGN_STR -#define __FUNC_ALIGN __ALIGN +#if defined(CONFIG_CALL_THUNKS) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; +#else +#define FUNCTION_PADDING +#endif + +#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO) +# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING +#else +# define __FUNC_ALIGN __ALIGN +#endif + +#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) #define SYM_F_ALIGN __FUNC_ALIGN #ifdef __ASSEMBLY__ @@ -45,11 +56,45 @@ #endif /* __ASSEMBLY__ */ +/* + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_THUNKS) the + * CFI symbol layout changes. + * + * Without CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .skip FUNCTION_PADDING, 0x90 + * .byte 0xb8 + * .long __kcfi_typeid_##name + * name: + * + * With CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .byte 0xb8 + * .long __kcfi_typeid_##name + * .skip FUNCTION_PADDING, 0x90 + * name: + * + * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. + */ + +#ifdef CONFIG_CALL_THUNKS +#define CFI_PRE_PADDING +#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#else +#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#define CFI_POST_PADDING +#endif + #define __CFI_TYPE(name) \ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ - .fill 11, 1, 0x90 ASM_NL \ + CFI_PRE_PADDING \ .byte 0xb8 ASM_NL \ .long __kcfi_typeid_##name ASM_NL \ + CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032f..5296aea9b5b40 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,11 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 +#ifdef CONFIG_CALL_THUNKS +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) +#else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#endif #else #define BPF_DISPATCHER_ATTRIBUTES #endif -- GitLab From 80e4c1cd42fff110bfdae8fce7ac4f22465f9664 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:19 +0200 Subject: [PATCH 0100/1423] x86/retbleed: Add X86_FEATURE_CALL_DEPTH Intel SKL CPUs fall back to other predictors when the RSB underflows. The only microcode mitigation is IBRS which is insanely expensive. It comes with performance drops of up to 30% depending on the workload. A way less expensive, but nevertheless horrible mitigation is to track the call depth in software and overeagerly fill the RSB when returns underflow the software counter. Provide a configuration symbol and a CPU misfeature bit. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.056176424@infradead.org --- arch/x86/Kconfig | 19 +++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 9 ++++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e368fc0daa4ac..6ae7fa4b8eb7d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2523,6 +2523,25 @@ config CPU_UNRET_ENTRY help Compile the kernel with support for the retbleed=unret mitigation. +config CALL_DEPTH_TRACKING + bool "Mitigate RSB underflow with call depth tracking" + depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select CALL_THUNKS + default y + help + Compile the kernel with call depth tracking to mitigate the Intel + SKL Return-Speculation-Buffer (RSB) underflow issue. The + mitigation is off by default and needs to be enabled on the + kernel command line via the retbleed=stuff option. For + non-affected systems the overhead of this option is marginal as + the call depth tracking is using run-time generated call thunks + in a compiler generated padding area and call patching. This + increases text size by ~5%. For non affected systems this space + is unused. On affected SKL systems this results in a significant + performance gain over the IBRS mitigation. + + config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b71f4f2ecdd57..aefd0816a333f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,6 +304,7 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_CALL_DEPTH (11*32+18) /* "" Call depth tracking for RSB stuffing */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33d2cd04d2544..bbb03b25263e7 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -69,6 +69,12 @@ # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +# define DISABLE_CALL_DEPTH_TRACKING 0 +#else +# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -101,7 +107,8 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 -- GitLab From fe54d0793796ccdb213d8ea7bff0b49903b6afaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:20 +0200 Subject: [PATCH 0101/1423] x86/alternatives: Provide text_poke_copy_locked() The upcoming call thunk patching must hold text_mutex and needs access to text_poke_copy(), which takes text_mutex. Provide a _locked postfixed variant to expose the inner workings. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.159977224@infradead.org --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 37 +++++++++++++++++----------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 1cc15528ce29b..f4b87f08f5c50 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cadcea035e04..fad3c0e4838ea 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1236,27 +1236,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(text_poke_memcpy, addr, opcode, len); } -/** - * text_poke_copy - Copy instructions into (an unused part of) RX memory - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy, could be more than 2x PAGE_SIZE - * - * Not safe against concurrent execution; useful for JITs to dump - * new code blocks into unused regions of RX memory. Can be used in - * conjunction with synchronize_rcu_tasks() to wait for existing - * execution to quiesce after having made sure no existing functions - * pointers are live. - */ -void *text_poke_copy(void *addr, const void *opcode, size_t len) +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; - if (WARN_ON_ONCE(core_kernel_text(start))) + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; - mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; @@ -1266,6 +1254,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } -- GitLab From c22cf380c79c4bb0e502b0343f57271b17626424 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:21 +0200 Subject: [PATCH 0102/1423] x86/entry: Make some entry symbols global paranoid_entry(), error_entry() and xen_error_entry() have to be exempted from call accounting by thunk patching because they are before UNTRAIN_RET. Expose them so they are available in the alternative code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.265598113@infradead.org --- arch/x86/entry/entry_64.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b24b84b3425fb..4cc0125fdfdc2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -327,7 +327,8 @@ SYM_CODE_END(ret_from_fork) #endif .endm -SYM_CODE_START_LOCAL(xen_error_entry) +SYM_CODE_START(xen_error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -906,7 +907,8 @@ SYM_CODE_END(xen_failsafe_callback) * R14 - old CR3 * R15 - old SPEC_CTRL */ -SYM_CODE_START_LOCAL(paranoid_entry) +SYM_CODE_START(paranoid_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -1041,7 +1043,8 @@ SYM_CODE_END(paranoid_exit) /* * Switch GS and CR3 if needed. */ -SYM_CODE_START_LOCAL(error_entry) +SYM_CODE_START(error_entry) + ANNOTATE_NOENDBR UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 -- GitLab From 239f2e248ef12840178a3ed1a217f19b5fbfde26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:22 +0200 Subject: [PATCH 0103/1423] x86/paravirt: Make struct paravirt_call_site unconditionally available For the upcoming call thunk patching it's less ifdeffery when the data structure is unconditionally available. The code can then be trivially fenced off with IS_ENABLED(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.367853167@infradead.org --- arch/x86/include/asm/paravirt.h | 4 ++-- arch/x86/include/asm/paravirt_types.h | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1be66c15ecbd7..2851bc2339d58 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -4,13 +4,13 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ +#include + #ifdef CONFIG_PARAVIRT #include #include #include -#include - #ifndef __ASSEMBLY__ #include #include diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f3d601574730c..e137d94121232 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,6 +2,17 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H +#ifndef __ASSEMBLY__ +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ + u8 len; /* length of original instruction */ +}; +#endif + +#ifdef CONFIG_PARAVIRT + /* Bitmask of what can be clobbered: usually at least eax. */ #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) @@ -593,16 +604,9 @@ unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #endif /* __ASSEMBLY__ */ - +#endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ -- GitLab From e81dc127ef69887c72735a3e3868930e2bf313ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:23 +0200 Subject: [PATCH 0104/1423] x86/callthunks: Add call patching for call depth tracking Mitigating the Intel SKL RSB underflow issue in software requires to track the call depth. That is every CALL and every RET need to be intercepted and additional code injected. The existing retbleed mitigations already include means of redirecting RET to __x86_return_thunk; this can be re-purposed and RET can be redirected to another function doing RET accounting. CALL accounting will use the function padding introduced in prior patches. For each CALL instruction, the destination symbol's padding is rewritten to do the accounting and the CALL instruction is adjusted to call into the padding. This ensures only affected CPUs pay the overhead of this accounting. Unaffected CPUs will leave the padding unused and have their 'JMP __x86_return_thunk' replaced with an actual 'RET' instruction. Objtool has been modified to supply a .call_sites section that lists all the 'CALL' instructions. Additionally the paravirt instruction sites are iterated since they will have been patched from an indirect call to direct calls (or direct instructions in which case it'll be ignored). Module handling and the actual thunk code for SKL will be added in subsequent steps. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.470877038@infradead.org --- arch/x86/Kconfig | 12 ++ arch/x86/include/asm/alternative.h | 12 ++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/alternative.c | 6 + arch/x86/kernel/callthunks.c | 251 +++++++++++++++++++++++++++ arch/x86/kernel/head_64.S | 1 + arch/x86/kernel/relocate_kernel_64.S | 5 +- arch/x86/kernel/vmlinux.lds.S | 8 - 8 files changed, 287 insertions(+), 10 deletions(-) create mode 100644 arch/x86/kernel/callthunks.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6ae7fa4b8eb7d..a1dae9d5e3dab 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2541,6 +2541,18 @@ config CALL_DEPTH_TRACKING is unused. On affected SKL systems this results in a significant performance gain over the IBRS mitigation. +config CALL_THUNKS_DEBUG + bool "Enable call thunks and call depth tracking debugging" + depends on CALL_DEPTH_TRACKING + select FUNCTION_ALIGNMENT_32B + default n + help + Enable call/ret counters for imbalance detection and build in + a noisy dmesg about callthunks generation and call patching for + trouble shooting. The debug prints need to be enabled on the + kernel command line with 'debug-callthunks'. + Only enable this, when you are debugging call thunks as this + creates a noticable runtime overhead. If unsure say N. config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546b..6b7bbd0db2480 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -80,6 +80,18 @@ extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); struct module; +struct paravirt_patch_site; + +struct callthunk_sites { + s32 *call_start, *call_end; + struct paravirt_patch_site *pv_start, *pv_end; +}; + +#ifdef CONFIG_CALL_THUNKS +extern void callthunks_patch_builtin_calls(void); +#else +static __always_inline void callthunks_patch_builtin_calls(void) {} +#endif #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f901658d9f7c0..c2739a5886fa2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CALL_THUNKS) += callthunks.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index fad3c0e4838ea..963872d177075 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -947,6 +947,12 @@ void __init alternative_instructions(void) */ apply_alternatives(__alt_instructions, __alt_instructions_end); + /* + * Now all calls are established. Apply the call thunks if + * required. + */ + callthunks_patch_builtin_calls(); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c new file mode 100644 index 0000000000000..e5275d6e674d9 --- /dev/null +++ b/arch/x86/kernel/callthunks.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "callthunks: " fmt + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int __initdata_or_module debug_callthunks; + +#define prdbg(fmt, args...) \ +do { \ + if (debug_callthunks) \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ +} while(0) + +static int __init debug_thunks(char *str) +{ + debug_callthunks = 1; + return 1; +} +__setup("debug-callthunks", debug_thunks); + +extern s32 __call_sites[], __call_sites_end[]; + +struct thunk_desc { + void *template; + unsigned int template_size; +}; + +struct core_text { + unsigned long base; + unsigned long end; + const char *name; +}; + +static bool thunks_initialized __ro_after_init; + +static const struct core_text builtin_coretext = { + .base = (unsigned long)_text, + .end = (unsigned long)_etext, + .name = "builtin", +}; + +static struct thunk_desc callthunk_desc __ro_after_init; + +extern void error_entry(void); +extern void xen_error_entry(void); +extern void paranoid_entry(void); + +static inline bool within_coretext(const struct core_text *ct, void *addr) +{ + unsigned long p = (unsigned long)addr; + + return ct->base <= p && p < ct->end; +} + +static inline bool within_module_coretext(void *addr) +{ + bool ret = false; + +#ifdef CONFIG_MODULES + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_core((unsigned long)addr, mod)) + ret = true; + preempt_enable(); +#endif + return ret; +} + +static bool is_coretext(const struct core_text *ct, void *addr) +{ + if (ct && within_coretext(ct, addr)) + return true; + if (within_coretext(&builtin_coretext, addr)) + return true; + return within_module_coretext(addr); +} + +static __init_or_module bool skip_addr(void *dest) +{ + if (dest == error_entry) + return true; + if (dest == paranoid_entry) + return true; + if (dest == xen_error_entry) + return true; + /* Does FILL_RSB... */ + if (dest == __switch_to_asm) + return true; + /* Accounts directly */ + if (dest == ret_from_fork) + return true; +#ifdef CONFIG_HOTPLUG_CPU + if (dest == start_cpu0) + return true; +#endif +#ifdef CONFIG_FUNCTION_TRACER + if (dest == __fentry__) + return true; +#endif +#ifdef CONFIG_KEXEC_CORE + if (dest >= (void *)relocate_kernel && + dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) + return true; +#endif +#ifdef CONFIG_XEN + if (dest >= (void *)hypercall_page && + dest < (void*)hypercall_page + PAGE_SIZE) + return true; +#endif + return false; +} + +static __init_or_module void *call_get_dest(void *addr) +{ + struct insn insn; + void *dest; + int ret; + + ret = insn_decode_kernel(&insn, addr); + if (ret) + return ERR_PTR(ret); + + /* Patched out call? */ + if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) + return NULL; + + dest = addr + insn.length + insn.immediate.value; + if (skip_addr(dest)) + return NULL; + return dest; +} + +static const u8 nops[] = { + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, +}; + +static __init_or_module void *patch_dest(void *dest, bool direct) +{ + unsigned int tsize = callthunk_desc.template_size; + u8 *pad = dest - tsize; + + /* Already patched? */ + if (!bcmp(pad, callthunk_desc.template, tsize)) + return pad; + + /* Ensure there are nops */ + if (bcmp(pad, nops, tsize)) { + pr_warn_once("Invalid padding area for %pS\n", dest); + return NULL; + } + + if (direct) + memcpy(pad, callthunk_desc.template, tsize); + else + text_poke_copy_locked(pad, callthunk_desc.template, tsize, true); + return pad; +} + +static __init_or_module void patch_call(void *addr, const struct core_text *ct) +{ + void *pad, *dest; + u8 bytes[8]; + + if (!within_coretext(ct, addr)) + return; + + dest = call_get_dest(addr); + if (!dest || WARN_ON_ONCE(IS_ERR(dest))) + return; + + if (!is_coretext(ct, dest)) + return; + + pad = patch_dest(dest, within_coretext(ct, dest)); + if (!pad) + return; + + prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, + dest, dest, pad); + __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); + text_poke_early(addr, bytes, CALL_INSN_SIZE); +} + +static __init_or_module void +patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) +{ + s32 *s; + + for (s = start; s < end; s++) + patch_call((void *)s + *s, ct); +} + +static __init_or_module void +patch_paravirt_call_sites(struct paravirt_patch_site *start, + struct paravirt_patch_site *end, + const struct core_text *ct) +{ + struct paravirt_patch_site *p; + + for (p = start; p < end; p++) + patch_call(p->instr, ct); +} + +static __init_or_module void +callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) +{ + prdbg("Patching call sites %s\n", ct->name); + patch_call_sites(cs->call_start, cs->call_end, ct); + patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + prdbg("Patching call sites done%s\n", ct->name); +} + +void __init callthunks_patch_builtin_calls(void) +{ + struct callthunk_sites cs = { + .call_start = __call_sites, + .call_end = __call_sites_end, + .pv_start = __parainstructions, + .pv_end = __parainstructions_end + }; + + if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return; + + pr_info("Setting up call depth tracking\n"); + mutex_lock(&text_mutex); + callthunks_setup(&cs, &builtin_coretext); + thunks_initialized = true; + mutex_unlock(&text_mutex); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d860d437631b6..222efd4a09bc8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -370,6 +370,7 @@ SYM_CODE_END(secondary_startup_64) * start_secondary() via .Ljump_to_C_code. */ SYM_CODE_START(start_cpu0) + ANNOTATE_NOENDBR UNWIND_HINT_EMPTY movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4809c0dc4eb0c..4a73351f87f88 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -41,6 +41,7 @@ .text .align PAGE_SIZE .code64 +SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR @@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) int3 SYM_CODE_END(swap_pages) - .globl kexec_control_code_size -.set kexec_control_code_size, . - relocate_kernel + .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc +SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b69df9e013cca..49f3f86433c7d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -501,11 +501,3 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_KEXEC_CORE -#include - -. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); -#endif - -- GitLab From eaf44c816ed8d1ef94c354e3ed47d53cd5a5cb13 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:24 +0200 Subject: [PATCH 0105/1423] x86/modules: Add call patching As for the builtins create call thunks and patch the call sites to call the thunk on Intel SKL CPUs for retbleed mitigation. Note, that module init functions are ignored for sake of simplicity because loading modules is not something which is done in high frequent loops and the attacker has not really a handle on when this happens in order to launch a matching attack. The depth tracking will still work for calls into the builtins and because the call is not accounted it will underflow faster and overstuff, but that's mitigated by the saturating counter and the side effect is only temporary. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.575673066@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 19 +++++++++++++++++++ arch/x86/kernel/module.c | 20 +++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 6b7bbd0db2480..ef007fa33dc41 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -89,8 +89,13 @@ struct callthunk_sites { #ifdef CONFIG_CALL_THUNKS extern void callthunks_patch_builtin_calls(void); +extern void callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod); #else static __always_inline void callthunks_patch_builtin_calls(void) {} +static __always_inline void +callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod) {} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e5275d6e674d9..7b9d998ebd7dc 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -249,3 +249,22 @@ void __init callthunks_patch_builtin_calls(void) thunks_initialized = true; mutex_unlock(&text_mutex); } + +#ifdef CONFIG_MODULES +void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, + struct module *mod) +{ + struct core_text ct = { + .base = (unsigned long)mod->core_layout.base, + .end = (unsigned long)mod->core_layout.base + mod->core_layout.size, + .name = mod->name, + }; + + if (!thunks_initialized) + return; + + mutex_lock(&text_mutex); + callthunks_setup(cs, &ct); + mutex_unlock(&text_mutex); +} +#endif /* CONFIG_MODULES */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 43f0112772192..2fb9de2cef40e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -254,7 +254,8 @@ int module_finalize(const Elf_Ehdr *hdr, { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -274,6 +275,8 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; if (!strcmp(".return_sites", secstrings + s->sh_name)) returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -299,6 +302,21 @@ int module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); -- GitLab From 770ae1b709528a6a173b5c7b183818ee9b45e376 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:25 +0200 Subject: [PATCH 0106/1423] x86/returnthunk: Allow different return thunks In preparation for call depth tracking on Intel SKL CPUs, make it possible to patch in a SKL specific return thunk. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.680469665@infradead.org --- arch/x86/include/asm/nospec-branch.h | 6 ++++++ arch/x86/kernel/alternative.c | 17 +++++++++++++---- arch/x86/kernel/ftrace.c | 2 +- arch/x86/kernel/static_call.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- 5 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c936ce9f0c47c..f10ca334dd752 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -208,6 +208,12 @@ extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_CALL_THUNKS +extern void (*x86_return_thunk)(void); +#else +#define x86_return_thunk (&__x86_return_thunk) +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 963872d177075..04d1e3d35b0e7 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -518,6 +518,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } #ifdef CONFIG_RETHUNK + +#ifdef CONFIG_CALL_THUNKS +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +#endif + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -533,14 +538,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - return -1; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (x86_return_thunk == __x86_return_thunk) + return -1; - bytes[i++] = RET_INSN_OPCODE; + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + bytes[i++] = RET_INSN_OPCODE; + } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; - return i; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 00eac455a3a1f..4ac6692d5ef82 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -359,7 +359,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index aaaba85d6d7ff..5d3844a98373e 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -52,7 +52,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case RET: if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; break; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad785..0df391ecd4d83 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -432,7 +432,7 @@ static void emit_return(u8 **pprog, u8 *ip) u8 *prog = *pprog; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - emit_jump(&prog, &__x86_return_thunk, ip); + emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ if (IS_ENABLED(CONFIG_SLS)) -- GitLab From 52354973573cc260ff2fc661cb28ff8eaa7b879b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:26 +0200 Subject: [PATCH 0107/1423] x86/asm: Provide ALTERNATIVE_3 Fairly straight forward adaptation/extention of ALTERNATIVE_2. Required for call depth tracking. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.787711192@infradead.org --- arch/x86/include/asm/alternative.h | 33 +++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ef007fa33dc41..4c416b21bac80 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -364,6 +364,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define old_len 141b-140b #define new_len1 144f-143f #define new_len2 145f-144f +#define new_len3 146f-145f /* * gas compatible max based on the idea from: @@ -371,7 +372,8 @@ static inline int alternatives_text_reserved(void *start, void *end) * * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) +#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c)) /* @@ -383,8 +385,8 @@ static inline int alternatives_text_reserved(void *start, void *end) 140: \oldinstr 141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 + .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_2(new_len1, new_len2) - (old_len)),0x90 142: .pushsection .altinstructions,"a" @@ -401,6 +403,31 @@ static inline int alternatives_text_reserved(void *start, void *end) .popsection .endm +.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3 +140: + \oldinstr +141: + .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \ + (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + \newinstr3 +146: + .popsection +.endm + /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ #define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ -- GitLab From 5d8213864ade86b48fc492584ea86d65a62f892e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:27 +0200 Subject: [PATCH 0108/1423] x86/retbleed: Add SKL return thunk To address the Intel SKL RSB underflow issue in software it's required to do call depth tracking. Provide a return thunk for call depth tracking on Intel SKL CPUs. The tracking does not use a counter. It uses uses arithmetic shift right on call entry and logical shift left on return. The depth tracking variable is initialized to 0x8000.... when the call depth is zero. The arithmetic shift right sign extends the MSB and saturates after the 12th call. The shift count is 5 so the tracking covers 12 nested calls. On return the variable is shifted left logically so it becomes zero again. CALL RET 0: 0x8000000000000000 0x0000000000000000 1: 0xfc00000000000000 0xf000000000000000 ... 11: 0xfffffffffffffff8 0xfffffffffffffc00 12: 0xffffffffffffffff 0xffffffffffffffe0 After a return buffer fill the depth is credited 12 calls before the next stuffing has to take place. There is a inaccuracy for situations like this: 10 calls 5 returns 3 calls 4 returns 3 calls .... The shift count might cause this to be off by one in either direction, but there is still a cushion vs. the RSB depth. The algorithm does not claim to be perfect, but it should obfuscate the problem enough to make exploitation extremly difficult. The theory behind this is: RSB is a stack with depth 16 which is filled on every call. On the return path speculation "pops" entries to speculate down the call chain. Once the speculative RSB is empty it switches to other predictors, e.g. the Branch History Buffer, which can be mistrained by user space and misguide the speculation path to a gadget. Call depth tracking is designed to break this speculation path by stuffing speculation trap calls into the RSB which are never getting a corresponding return executed. This stalls the prediction path until it gets resteered, The assumption is that stuffing at the 12th return is sufficient to break the speculation before it hits the underflow and the fallback to the other predictors. Testing confirms that it works. Johannes, one of the retbleed researchers. tried to attack this approach but failed. There is obviously no scientific proof that this will withstand future research progress, but all we can do right now is to speculate about it. The SAR/SHL usage was suggested by Andi Kleen. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.890071690@infradead.org --- arch/x86/entry/entry_64.S | 10 ++- arch/x86/include/asm/current.h | 3 + arch/x86/include/asm/nospec-branch.h | 121 +++++++++++++++++++++++++-- arch/x86/kernel/asm-offsets.c | 3 + arch/x86/kvm/svm/vmenter.S | 1 + arch/x86/lib/retpoline.S | 31 +++++++ 6 files changed, 159 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cc0125fdfdc2..15739a2c09833 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,6 +288,7 @@ SYM_FUNC_END(__switch_to_asm) SYM_CODE_START_NOALIGN(ret_from_fork) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // copy_thread + CALL_DEPTH_ACCOUNT movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -332,7 +333,7 @@ SYM_CODE_START(xen_error_entry) UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(xen_error_entry) @@ -977,7 +978,7 @@ SYM_CODE_START(paranoid_entry) * CR3 above, keep the old value in a callee saved register. */ IBRS_ENTER save_reg=%r15 - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL RET SYM_CODE_END(paranoid_entry) @@ -1062,7 +1063,7 @@ SYM_CODE_START(error_entry) /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ /* Put us onto the real thread stack. */ @@ -1097,6 +1098,7 @@ SYM_CODE_START(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ ANNOTATE_UNRET_END RET @@ -1115,7 +1117,7 @@ SYM_CODE_START(error_entry) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax IBRS_ENTER - UNTRAIN_RET + UNTRAIN_RET_FROM_CALL /* * Pretend that the exception came from user mode: set up pt_regs diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index b89aba077b84b..a1168e7b69e5b 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -17,6 +17,9 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; +#ifdef CONFIG_CALL_DEPTH_TRACKING + u64 call_depth; +#endif unsigned long top_of_stack; void *hardirq_stack_ptr; u16 softirq_pending; diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f10ca334dd752..d4be826a22824 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,8 +12,83 @@ #include #include #include +#include -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero. The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. + * + * There is a inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremly difficult. + */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define ASM_CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH \ + mov $0x80, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + mov $0xfc, %rax; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#define INCREMENT_CALL_DEPTH \ + sarq $5, %gs:pcpu_hot + X86_call_depth; + +#define ASM_INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + +#else +#define CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define INCREMENT_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#endif /* * Fill the CPU return stack buffer. @@ -32,6 +107,7 @@ * from C via asm(".include ") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* @@ -60,7 +136,8 @@ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ - lfence; + lfence; \ + ASM_CREDIT_CALL_DEPTH #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -185,11 +262,32 @@ * where we have a stack but before any RET instruction. */ .macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) ANNOTATE_UNRET_END - ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +.macro UNTRAIN_RET_FROM_CALL +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ + defined(CONFIG_X86_FEATURE_CALL_DEPTH) + ANNOTATE_UNRET_END + ALTERNATIVE_3 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ + __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH +#endif +.endm + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -214,6 +312,17 @@ extern void (*x86_return_thunk)(void); #define x86_return_thunk (&__x86_return_thunk) #endif +#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __x86_return_skl(void); + +static inline void x86_set_skl_return_thunk(void) +{ + x86_return_thunk = &__x86_return_skl; +} +#else +static inline void x86_set_skl_return_thunk(void) {} +#endif + #ifdef CONFIG_RETPOLINE #define GEN(reg) \ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a9824318e1c55..13afdbbee349e 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -110,6 +110,9 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 723f8534986c3..09eacf19d7185 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include #include #include diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 073289a55f849..1e79eccc1d698 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include .section .text.__x86.indirect_thunk @@ -140,3 +142,32 @@ __EXPORT_THUNK(zen_untrain_ret) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ + +#ifdef CONFIG_CALL_DEPTH_TRACKING + + .align 64 +SYM_FUNC_START(__x86_return_skl) + ANNOTATE_NOENDBR + /* Keep the hotpath in a 16byte I-fetch */ + shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + jz 1f + ANNOTATE_UNRET_SAFE + ret + int3 +1: + .rept 16 + ANNOTATE_INTRA_FUNCTION_CALL + call 2f + int3 +2: + .endr + add $(8*16), %rsp + + CREDIT_CALL_DEPTH + + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(__x86_return_skl) + +#endif /* CONFIG_CALL_DEPTH_TRACKING */ -- GitLab From 3b6c1747da48ff40ab746b0e860cffe83619f5c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:28 +0200 Subject: [PATCH 0109/1423] x86/retpoline: Add SKL retthunk retpolines Ensure that retpolines do the proper call accounting so that the return accounting works correctly. Specifically; retpolines are used to replace both 'jmp *%reg' and 'call *%reg', however these two cases do not have the same accounting requirements. Therefore split things up and provide two different retpoline arrays for SKL. The 'jmp *%reg' case needs no accounting, the __x86_indirect_jump_thunk_array[] covers this. The retpoline is changed to not use the return thunk; it's a simple call;ret construct. [ strictly speaking it should do: andq $(~0x1f), PER_CPU_VAR(__x86_call_depth) but we can argue this can be covered by the fuzz we already have in the accounting depth (12) vs the RSB depth (16) ] The 'call *%reg' case does need accounting, the __x86_indirect_call_thunk_array[] covers this. Again, this retpoline avoids the use of the return-thunk, in this case to avoid double accounting. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111147.996634749@infradead.org --- arch/x86/include/asm/nospec-branch.h | 12 +++++ arch/x86/kernel/alternative.c | 59 +++++++++++++++++++++-- arch/x86/lib/retpoline.S | 71 ++++++++++++++++++++++++---- arch/x86/net/bpf_jit_comp.c | 5 +- 4 files changed, 135 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d4be826a22824..06ba7caa0cad0 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -301,6 +301,8 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); @@ -330,6 +332,16 @@ static inline void x86_set_skl_return_thunk(void) {} #include #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include +#undef GEN + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 04d1e3d35b0e7..19221d77dc275 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -377,6 +377,56 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN("%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -409,8 +459,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +481,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 1e79eccc1d698..e00206077ae91 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -14,17 +14,18 @@ .section .text.__x86.indirect_thunk -.macro RETPOLINE reg + +.macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ -.Lspec_trap_\@: - UNWIND_HINT_EMPTY - pause - lfence - jmp .Lspec_trap_\@ + int3 .Ldo_rop_\@: mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC +.endm + +.macro RETPOLINE reg + POLINE \reg RET .endm @@ -54,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) */ #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) .align RETPOLINE_THUNK_SIZE SYM_CODE_START(__x86_indirect_thunk_array) @@ -66,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_CODE_END(__x86_indirect_thunk_array) -#define GEN(reg) EXPORT_THUNK(reg) +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#include +#undef GEN + +#ifdef CONFIG_CALL_DEPTH_TRACKING +.macro CALL_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + + CALL_DEPTH_ACCOUNT + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_call_thunk_array) + +#define GEN(reg) CALL_THUNK reg #include #undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_call_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg) +#include +#undef GEN + +.macro JUMP_THUNK reg + .align RETPOLINE_THUNK_SIZE + +SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + POLINE \reg + ANNOTATE_UNRET_SAFE + ret + int3 +.endm + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_jump_thunk_array) + +#define GEN(reg) JUMP_THUNK reg +#include +#undef GEN + + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_jump_thunk_array) + +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) +#include +#undef GEN +#endif /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0df391ecd4d83..ad8cb7f15ab8a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -417,7 +417,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + else + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); } else { EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS)) -- GitLab From bbaceb189a21d7245e8063701fe10985396028f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:29 +0200 Subject: [PATCH 0110/1423] x86/retbleed: Add SKL call thunk Add the actual SKL call thunk for call depth accounting. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.101125588@infradead.org --- arch/x86/kernel/callthunks.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7b9d998ebd7dc..01f6f6b5a93cd 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,21 @@ static const struct core_text builtin_coretext = { .name = "builtin", }; -static struct thunk_desc callthunk_desc __ro_after_init; +asm ( + ".pushsection .rodata \n" + ".global skl_call_thunk_template \n" + "skl_call_thunk_template: \n" + __stringify(INCREMENT_CALL_DEPTH)" \n" + ".global skl_call_thunk_tail \n" + "skl_call_thunk_tail: \n" + ".popsection \n" +); + +extern u8 skl_call_thunk_template[]; +extern u8 skl_call_thunk_tail[]; + +#define SKL_TMPL_SIZE \ + ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) extern void error_entry(void); extern void xen_error_entry(void); @@ -157,11 +172,11 @@ static const u8 nops[] = { static __init_or_module void *patch_dest(void *dest, bool direct) { - unsigned int tsize = callthunk_desc.template_size; + unsigned int tsize = SKL_TMPL_SIZE; u8 *pad = dest - tsize; /* Already patched? */ - if (!bcmp(pad, callthunk_desc.template, tsize)) + if (!bcmp(pad, skl_call_thunk_template, tsize)) return pad; /* Ensure there are nops */ @@ -171,9 +186,9 @@ static __init_or_module void *patch_dest(void *dest, bool direct) } if (direct) - memcpy(pad, callthunk_desc.template, tsize); + memcpy(pad, skl_call_thunk_template, tsize); else - text_poke_copy_locked(pad, callthunk_desc.template, tsize, true); + text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); return pad; } -- GitLab From f5c1bb2afe93396d41c5cbdcb909b08a75b8dde4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:30 +0200 Subject: [PATCH 0111/1423] x86/calldepth: Add ret/call counting for debug Add a debuigfs mechanism to validate the accounting, e.g. vs. call/ret balance and to gather statistics about the stuffing to call ratio. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.204285506@infradead.org --- arch/x86/include/asm/nospec-branch.h | 36 ++++++++++++++++--- arch/x86/kernel/callthunks.c | 53 ++++++++++++++++++++++++++++ arch/x86/lib/retpoline.S | 7 +++- 3 files changed, 91 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 06ba7caa0cad0..4771147c7c5a6 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -57,6 +57,22 @@ #define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL #define RET_DEPTH_CREDIT 0xffffffffffffffffULL +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq %gs:__x86_call_count; +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq %gs:__x86_ret_count; +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq %gs:__x86_stuffs_count; +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq %gs:__x86_ctxsw_count; +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include @@ -75,18 +91,23 @@ #define RESET_CALL_DEPTH_FROM_CALL \ mov $0xfc, %rax; \ shl $56, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; + sarq $5, %gs:pcpu_hot + X86_call_depth; \ + CALL_THUNKS_DEBUG_INC_CALLS #define ASM_INCREMENT_CALL_DEPTH \ - sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); + sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH +#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH #define INCREMENT_CALL_DEPTH +#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL #endif @@ -137,7 +158,8 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH + ASM_CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't @@ -321,6 +343,12 @@ static inline void x86_set_skl_return_thunk(void) { x86_return_thunk = &__x86_return_skl; } +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); +DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif #else static inline void x86_set_skl_return_thunk(void) {} #endif diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 01f6f6b5a93cd..dfe7ffff88b93 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) "callthunks: " fmt +#include #include #include #include @@ -35,6 +36,15 @@ static int __init debug_thunks(char *str) } __setup("debug-callthunks", debug_thunks); +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + extern s32 __call_sites[], __call_sites_end[]; struct thunk_desc { @@ -283,3 +293,46 @@ void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, mutex_unlock(&text_mutex); } #endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name [10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e00206077ae91..5f61c65322bea 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -203,13 +203,18 @@ EXPORT_SYMBOL(__x86_return_thunk) .align 64 SYM_FUNC_START(__x86_return_skl) ANNOTATE_NOENDBR - /* Keep the hotpath in a 16byte I-fetch */ + /* + * Keep the hotpath in a 16byte I-fetch for the non-debug + * case. + */ + CALL_THUNKS_DEBUG_INC_RETS shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) jz 1f ANNOTATE_UNRET_SAFE ret int3 1: + CALL_THUNKS_DEBUG_INC_STUFFS .rept 16 ANNOTATE_INTRA_FUNCTION_CALL call 2f -- GitLab From 7825451fa4dc04660f1f53d236e4302161d0ebd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:31 +0200 Subject: [PATCH 0112/1423] static_call: Add call depth tracking support When indirect calls are switched to direct calls then it has to be ensured that the call target is not the function, but the call thunk when call depth tracking is enabled. But static calls are available before call thunks have been set up. Ensure a second run through the static call patching code after call thunks have been created. When call thunks are not enabled this has no side effects. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.306100465@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 18 ++++++++++++++++++ arch/x86/kernel/static_call.c | 1 + include/linux/static_call.h | 2 ++ kernel/static_call_inline.c | 23 ++++++++++++++++++----- 5 files changed, 44 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c416b21bac80..07ac25793a3f4 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -91,11 +91,16 @@ struct callthunk_sites { extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); +extern void *callthunks_translate_call_dest(void *dest); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod) {} +static __always_inline void *callthunks_translate_call_dest(void *dest) +{ + return dest; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index dfe7ffff88b93..071003605a865 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -271,10 +272,27 @@ void __init callthunks_patch_builtin_calls(void) pr_info("Setting up call depth tracking\n"); mutex_lock(&text_mutex); callthunks_setup(&cs, &builtin_coretext); + static_call_force_reinit(); thunks_initialized = true; mutex_unlock(&text_mutex); } +void *callthunks_translate_call_dest(void *dest) +{ + void *target; + + lockdep_assert_held(&text_mutex); + + if (!thunks_initialized || skip_addr(dest)) + return dest; + + if (!is_coretext(NULL, dest)) + return dest; + + target = patch_dest(dest, false); + return target ? : dest; +} + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 5d3844a98373e..2ebc338980bcd 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -34,6 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, switch (type) { case CALL: + func = callthunks_translate_call_dest(func); code = text_gen_insn(CALL_INSN_OPCODE, insn, func); if (func == &__static_call_return0) { emulate = code; diff --git a/include/linux/static_call.h b/include/linux/static_call.h index df53bed9d71f1..141e6b176a1b3 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -162,6 +162,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool extern int __init static_call_init(void); +extern void static_call_force_reinit(void); + struct static_call_mod { struct static_call_mod *next; struct module *mod; /* for vmlinux, mod == NULL */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index dc5665b628140..639397b5491ca 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,18 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static bool static_call_initialized; +static int static_call_initialized; + +/* + * Must be called before early_initcall() to be effective. + */ +void static_call_force_reinit(void) +{ + if (WARN_ON_ONCE(!static_call_initialized)) + return; + + static_call_initialized++; +} /* mutex to protect key modules/sites */ static DEFINE_MUTEX(static_call_mutex); @@ -475,7 +486,8 @@ int __init static_call_init(void) { int ret; - if (static_call_initialized) + /* See static_call_force_reinit(). */ + if (static_call_initialized == 1) return 0; cpus_read_lock(); @@ -490,11 +502,12 @@ int __init static_call_init(void) BUG(); } - static_call_initialized = true; - #ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); + if (!static_call_initialized) + register_module_notifier(&static_call_module_nb); #endif + + static_call_initialized = 1; return 0; } early_initcall(static_call_init); -- GitLab From f1389181622a08d6f1c71407a64df36b809d632b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:32 +0200 Subject: [PATCH 0113/1423] kallsyms: Take callthunks into account Since the pre-symbol function padding is an integral part of the symbol make kallsyms report it as part of the symbol by reporting it as sym-x instead of prev_sym+y. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.409656012@infradead.org --- kernel/kallsyms.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 60c20f301a6ba..cc244c02b4cf6 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -293,6 +293,12 @@ static unsigned long get_symbol_pos(unsigned long addr, return low; } +#ifdef CONFIG_FUNCTION_PADDING_BYTES +#define PADDING_BYTES CONFIG_FUNCTION_PADDING_BYTES +#else +#define PADDING_BYTES 0 +#endif + /* * Lookup an address but don't bother to find any names. */ @@ -300,13 +306,25 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + int ret; + + addr += PADDING_BYTES; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); - return 1; + ret = 1; + goto found; + } + + ret = !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf); + if (!ret) { + ret = !!__bpf_address_lookup(addr, symbolsize, + offset, namebuf); } - return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); +found: + if (ret && offset) + *offset -= PADDING_BYTES; + return ret; } static const char *kallsyms_lookup_buildid(unsigned long addr, @@ -319,6 +337,8 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -348,6 +368,8 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, found: cleanup_symbol_name(namebuf); + if (ret && offset) + *offset -= PADDING_BYTES; return ret; } @@ -374,6 +396,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -401,6 +425,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; + addr += PADDING_BYTES; + if (is_ksym_addr(addr)) { unsigned long pos; @@ -417,6 +443,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, return res; found: + if (offset) + *offset -= PADDING_BYTES; cleanup_symbol_name(name); return 0; } @@ -442,8 +470,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, len = strlen(buffer); offset -= symbol_offset; - if (add_offset) - len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); + if (add_offset) { + char s = '+'; + + if ((long)offset < 0) { + s = '-'; + offset = 0UL - offset; + } + len += sprintf(buffer + len, "%c%#lx/%#lx", s, offset, size); + } if (modname) { len += sprintf(buffer + len, " [%s", modname); -- GitLab From 396e0b8e09e86440c2119d12c2101110d3cd5bf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:33 +0200 Subject: [PATCH 0114/1423] x86/orc: Make it callthunk aware Callthunks addresses on the stack would confuse the ORC unwinder. Handle them correctly and tell ORC to proceed further down the stack. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.511637628@infradead.org --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/callthunks.c | 13 +++++++++++++ arch/x86/kernel/unwind_orc.c | 21 ++++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 07ac25793a3f4..4b8cd256c95ec 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -92,6 +92,7 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); +extern bool is_callthunk(void *addr); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -101,6 +102,10 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) { return dest; } +static __always_inline bool is_callthunk(void *addr) +{ + return false; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 071003605a865..7f9788194eb5a 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -293,6 +293,19 @@ void *callthunks_translate_call_dest(void *dest) return target ? : dest; } +bool is_callthunk(void *addr) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + unsigned long dest; + + dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); + if (!thunks_initialized || skip_addr((void *)dest)) + return false; + + return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); +} + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 0ea57da929407..cfac2b54b37bc 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -136,6 +136,21 @@ static struct orc_entry null_orc_entry = { .type = UNWIND_HINT_TYPE_CALL }; +#ifdef CONFIG_CALL_THUNKS +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + if (!is_callthunk((void *)ip)) + return NULL; + + return &null_orc_entry; +} +#else +static struct orc_entry *orc_callthunk_find(unsigned long ip) +{ + return NULL; +} +#endif + /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { .type = UNWIND_HINT_TYPE_CALL, @@ -189,7 +204,11 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; - return orc_ftrace_find(ip); + orc = orc_ftrace_find(ip); + if (orc) + return orc; + + return orc_callthunk_find(ip); } #ifdef CONFIG_MODULES -- GitLab From b2e9dfe54be4d023124d588d6f03d16a9c0d2507 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:34 +0200 Subject: [PATCH 0115/1423] x86/bpf: Emit call depth accounting if required Ensure that calls in BPF jitted programs are emitting call depth accounting when enabled to keep the call/return balanced. The return thunk jump is already injected due to the earlier retbleed mitigations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.615413406@infradead.org --- arch/x86/include/asm/alternative.h | 6 ++++++ arch/x86/kernel/callthunks.c | 19 ++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 32 +++++++++++++++++++++--------- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4b8cd256c95ec..664c0779375c1 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -93,6 +93,7 @@ extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); extern bool is_callthunk(void *addr); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -106,6 +107,11 @@ static __always_inline bool is_callthunk(void *addr) { return false; } +static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, + void *func) +{ + return 0; +} #endif #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7f9788194eb5a..a03d646b5e692 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -306,6 +306,25 @@ bool is_callthunk(void *addr) return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); } +#ifdef CONFIG_BPF_JIT +int x86_call_depth_emit_accounting(u8 **pprog, void *func) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + + if (!thunks_initialized) + return 0; + + /* Is function call target a thunk? */ + if (is_callthunk(func)) + return 0; + + memcpy(*pprog, tmpl, tmpl_size); + *pprog += tmpl_size; + return tmpl_size; +} +#endif + #ifdef CONFIG_MODULES void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ad8cb7f15ab8a..a6b46740ea300 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -340,6 +340,13 @@ static int emit_call(u8 **pprog, void *func, void *ip) return emit_patch(pprog, func, ip, 0xE8); } +static int emit_rsb_call(u8 **pprog, void *func, void *ip) +{ + OPTIMIZER_HIDE_VAR(func); + x86_call_depth_emit_accounting(pprog, func); + return emit_patch(pprog, func, ip, 0xE8); +} + static int emit_jump(u8 **pprog, void *func, void *ip) { return emit_patch(pprog, func, ip, 0xE9); @@ -1436,19 +1443,26 @@ st: if (is_imm8(insn->off)) break; /* call */ - case BPF_JMP | BPF_CALL: + case BPF_JMP | BPF_CALL: { + int offs; + func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(bpf_prog->aux->stack_depth, 8) - 8); - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + if (!imm32) return -EINVAL; + offs = 7 + x86_call_depth_emit_accounting(&prog, func); } else { - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + if (!imm32) return -EINVAL; + offs = x86_call_depth_emit_accounting(&prog, func); } + if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + return -EINVAL; break; + } case BPF_JMP | BPF_TAIL_CALL: if (imm32) @@ -1854,7 +1868,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, enter, prog)) + if (emit_rsb_call(&prog, enter, prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1875,7 +1889,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, (long) p->insnsi >> 32, (u32) (long) p->insnsi); /* call JITed bpf program or interpreter */ - if (emit_call(&prog, p->bpf_func, prog)) + if (emit_rsb_call(&prog, p->bpf_func, prog)) return -EINVAL; /* @@ -1899,7 +1913,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, exit, prog)) + if (emit_rsb_call(&prog, exit, prog)) return -EINVAL; *pprog = prog; @@ -2147,7 +2161,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_enter, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) { ret = -EINVAL; goto cleanup; } @@ -2179,7 +2193,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT2(0xff, 0xd0); /* call *rax */ } else { /* call original function */ - if (emit_call(&prog, orig_call, prog)) { + if (emit_rsb_call(&prog, orig_call, prog)) { ret = -EINVAL; goto cleanup; } @@ -2223,7 +2237,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i im->ip_epilogue = prog; /* arg1: mov rdi, im */ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); - if (emit_call(&prog, __bpf_tramp_exit, prog)) { + if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) { ret = -EINVAL; goto cleanup; } -- GitLab From eac828eaef295cd0cc8b58f55fa5c8401fdc2370 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:35 +0200 Subject: [PATCH 0116/1423] x86/ftrace: Remove ftrace_epilogue() Remove the weird jumps to RET and simply use RET. This then promotes ftrace_stub() to a real function; which becomes important for kcfi. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.719080593@infradead.org --- arch/x86/kernel/ftrace_64.S | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de5617..a90c55a6b4817 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -172,20 +172,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) +SYM_FUNC_START(ftrace_stub) UNWIND_HINT_FUNC - ENDBR RET -SYM_FUNC_END(ftrace_epilogue) +SYM_FUNC_END(ftrace_stub) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -262,14 +256,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +271,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) -- GitLab From 36b64f101219dd9e6e4f0ea880b64e8a90da547b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:36 +0200 Subject: [PATCH 0117/1423] x86/ftrace: Rebalance RSB ftrace_regs_caller() uses a PUSH;RET pattern to tail-call into a direct-call function, this unbalances the RSB, fix that. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.823216933@infradead.org --- arch/x86/kernel/ftrace_64.S | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a90c55a6b4817..b5b54f58957e7 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -271,6 +271,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC + + /* + * The above left an extra return value on the stack; effectively + * doing a tail-call without using a register. This PUSH;RET + * pattern unbalances the RSB, inject a pointless CALL to rebalance. + */ + ANNOTATE_INTRA_FUNCTION_CALL + CALL .Ldo_rebalance + int3 +.Ldo_rebalance: + add $8, %rsp RET SYM_FUNC_END(ftrace_regs_caller) -- GitLab From ee3e2469b3463d28ca4cde20e0283319ac6a562d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:37 +0200 Subject: [PATCH 0118/1423] x86/ftrace: Make it call depth tracking aware Since ftrace has trampolines, don't use thunks for the __fentry__ site but instead require that every function called from there includes accounting. This very much includes all the direct-call functions. Additionally, ftrace uses ROP tricks in two places: - return_to_handler(), and - ftrace_regs_caller() when pt_regs->orig_ax is set by a direct-call. return_to_handler() already uses a retpoline to replace an indirect-jump to defeat IBT, since this is a jump-type retpoline, make sure there is no accounting done and ALTERNATIVE the RET into a ret. ftrace_regs_caller() does much the same and gets the same treatment. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.927545073@infradead.org --- arch/x86/include/asm/nospec-branch.h | 9 +++++++++ arch/x86/kernel/callthunks.c | 2 +- arch/x86/kernel/ftrace.c | 16 +++++++++++---- arch/x86/kernel/ftrace_64.S | 22 +++++++++++++++++++-- arch/x86/net/bpf_jit_comp.c | 6 ++++++ kernel/trace/trace_selftest.c | 9 ++++++++- samples/ftrace/ftrace-direct-modify.c | 3 +++ samples/ftrace/ftrace-direct-multi-modify.c | 3 +++ samples/ftrace/ftrace-direct-multi.c | 2 ++ samples/ftrace/ftrace-direct-too.c | 2 ++ samples/ftrace/ftrace-direct.c | 2 ++ 11 files changed, 68 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4771147c7c5a6..82580adbca4b2 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -343,6 +343,12 @@ static inline void x86_set_skl_return_thunk(void) { x86_return_thunk = &__x86_return_skl; } + +#define CALL_DEPTH_ACCOUNT \ + ALTERNATIVE("", \ + __stringify(INCREMENT_CALL_DEPTH), \ + X86_FEATURE_CALL_DEPTH) + #ifdef CONFIG_CALL_THUNKS_DEBUG DECLARE_PER_CPU(u64, __x86_call_count); DECLARE_PER_CPU(u64, __x86_ret_count); @@ -351,6 +357,9 @@ DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif #else static inline void x86_set_skl_return_thunk(void) {} + +#define CALL_DEPTH_ACCOUNT "" + #endif #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index a03d646b5e692..7d2c75ec9a8cd 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -316,7 +316,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; /* Is function call target a thunk? */ - if (is_callthunk(func)) + if (func && is_callthunk(func)) return 0; memcpy(*pprog, tmpl, tmpl_size); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4ac6692d5ef82..cf15ef5aecff2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void) static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) { + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting itself. + */ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); } @@ -317,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long size; unsigned long *ptr; void *trampoline; - void *ip; + void *ip, *dest; /* 48 8b 15 is movq (%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; @@ -404,10 +408,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* put in the call to the function */ mutex_lock(&text_mutex); call_offset -= start_offset; + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting before the call already. + */ + dest = ftrace_ops_get_func(ops); memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, - trampoline + call_offset, - ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index b5b54f58957e7..6a7e6d666a12b 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -132,6 +133,7 @@ #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -140,6 +142,8 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + CALL_DEPTH_ACCOUNT + /* Stack - skipping return address of ftrace_caller */ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx movq %rcx, RSP(%rsp) @@ -155,6 +159,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Only ops with REGS flag set should have CS register set */ movq $0, CS(%rsp) + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -189,6 +196,8 @@ SYM_FUNC_START(ftrace_regs_caller) save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ @@ -219,6 +228,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) /* regs go into 4th parameter */ leaq (%rsp), %rcx + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) ANNOTATE_NOENDBR call ftrace_stub @@ -282,7 +294,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) int3 .Ldo_rebalance: add $8, %rsp - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) @@ -291,6 +305,8 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -347,6 +363,8 @@ SYM_CODE_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - RET + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a6b46740ea300..f46b62029d91e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2135,6 +2136,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog = image; EMIT_ENDBR(); + /* + * This is the direct-call trampoline, as such it needs accounting + * for the __fentry__ call. + */ + x86_call_depth_emit_accounting(&prog, NULL); EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2d301f58ceda..ff0536cea9682 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -noinline __noclone static void trace_direct_tramp(void) { } +#ifndef CALL_DEPTH_ACCOUNT +#define CALL_DEPTH_ACCOUNT "" +#endif + +noinline __noclone static void trace_direct_tramp(void) +{ + asm(CALL_DEPTH_ACCOUNT); +} #endif /* diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index 39146fa83e20b..de5a0f67f320c 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -3,6 +3,7 @@ #include #include #include +#include extern void my_direct_func1(void); extern void my_direct_func2(void); @@ -34,6 +35,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " call my_direct_func1\n" " leave\n" " .size my_tramp1, .-my_tramp1\n" @@ -45,6 +47,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " call my_direct_func2\n" " leave\n" ASM_RET diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index 65aa94d96f4e3..d52370cad0b6e 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -3,6 +3,7 @@ #include #include #include +#include extern void my_direct_func1(unsigned long ip); extern void my_direct_func2(unsigned long ip); @@ -32,6 +33,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func1\n" @@ -46,6 +48,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func2\n" diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index 41ded7c615c7f..ec1088922517d 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -5,6 +5,7 @@ #include #include #include +#include extern void my_direct_func(unsigned long ip); @@ -27,6 +28,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " movq 8(%rbp), %rdi\n" " call my_direct_func\n" diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 6690468c5cc2d..e13fb59a2b47a 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -4,6 +4,7 @@ #include /* for handle_mm_fault() */ #include #include +#include extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -29,6 +30,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " pushq %rsi\n" " pushq %rdx\n" diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index e8f1e440b9b8a..1f769d0db20f3 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -4,6 +4,7 @@ #include /* for wake_up_process() */ #include #include +#include extern void my_direct_func(struct task_struct *p); @@ -26,6 +27,7 @@ asm ( ASM_ENDBR " pushq %rbp\n" " movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT " pushq %rdi\n" " call my_direct_func\n" " popq %rdi\n" -- GitLab From d82a0345cf218f5050f5ad913e1ae6c579105731 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:11:38 +0200 Subject: [PATCH 0119/1423] x86/retbleed: Add call depth tracking mitigation The fully secure mitigation for RSB underflow on Intel SKL CPUs is IBRS, which inflicts up to 30% penalty for pathological syscall heavy work loads. Software based call depth tracking and RSB refill is not perfect, but reduces the attack surface massively. The penalty for the pathological case is about 8% which is still annoying but definitely more palatable than IBRS. Add a retbleed=stuff command line option to enable the call depth tracking and software refill of the RSB. This gives admins a choice. IBeeRS are safe and cause headaches, call depth tracking is considered to be s(t)ufficiently safe. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111149.029587352@infradead.org --- arch/x86/kernel/cpu/bugs.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index da7c361f47e0d..e6c23ead16170 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -787,6 +787,7 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, + RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { @@ -794,6 +795,7 @@ enum retbleed_mitigation_cmd { RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, + RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { @@ -802,6 +804,7 @@ static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", + [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = @@ -831,6 +834,8 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "stuff")) { + retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else { @@ -879,6 +884,21 @@ static void __init retbleed_select_mitigation(void) } break; + case RETBLEED_CMD_STUFF: + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + + } else { + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + else + pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + + goto do_cmd_auto; + } + break; + do_cmd_auto: case RETBLEED_CMD_AUTO: default: @@ -916,6 +936,12 @@ do_cmd_auto: mitigate_smt = true; break; + case RETBLEED_MITIGATION_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + x86_set_skl_return_thunk(); + break; + default: break; } @@ -926,7 +952,7 @@ do_cmd_auto: /* * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option. + * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { @@ -939,7 +965,8 @@ do_cmd_auto: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - pr_err(RETBLEED_INTEL_MSG); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } } @@ -1413,6 +1440,7 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && + retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; -- GitLab From 5c9a92dec3235b0c1d51e92860f8014753161593 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 17 Oct 2022 16:41:20 +0200 Subject: [PATCH 0120/1423] x86/bugs: Add retbleed=force Debug aid, allows running retbleed=force,stuff on non-affected uarchs Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/kernel/cpu/bugs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e6c23ead16170..b307b83e22bef 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -838,6 +838,8 @@ static int __init retbleed_parse_cmdline(char *str) retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } -- GitLab From ce3a0a29fb9f36d1cd221fffa64a3c405308868f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 8 Oct 2022 22:33:53 +0300 Subject: [PATCH 0121/1423] gpio: merrifield: Use str_enable_disable() helper Use str_enable_disable() helper instead of open coding the same. Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-merrifield.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-merrifield.c b/drivers/gpio/gpio-merrifield.c index 72ac09a597029..92ea8411050dd 100644 --- a/drivers/gpio/gpio-merrifield.c +++ b/drivers/gpio/gpio-merrifield.c @@ -14,6 +14,7 @@ #include #include #include +#include #define GCCR 0x000 /* controller configuration */ #define GPLR 0x004 /* pin level r/o */ @@ -331,7 +332,7 @@ static int mrfld_irq_set_wake(struct irq_data *d, unsigned int on) raw_spin_unlock_irqrestore(&priv->lock, flags); - dev_dbg(priv->dev, "%sable wake for gpio %u\n", on ? "en" : "dis", gpio); + dev_dbg(priv->dev, "%s wake for gpio %u\n", str_enable_disable(on), gpio); return 0; } -- GitLab From 26312973bfbc1db24b157797776ee2b5b48f5c50 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Thu, 6 Oct 2022 12:14:56 -0400 Subject: [PATCH 0122/1423] IB/uverbs: fix the typo of optional Fix the typo of optional in the function of UVERBS_HANDLER. Signed-off-by: Deming Wang Link: https://lore.kernel.org/r/20221006161456.2998-1-wangdeming@inspur.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_std_types_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index dd1075466f61b..7b4773fa4bc0b 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -163,7 +163,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) return -EINVAL; - /* send_cq is optinal */ + /* send_cq is optional */ if (cap.max_send_wr) { send_cq = uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); -- GitLab From 53c2d5b14a82f6e7f0f8089083972df20e66a354 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sat, 1 Oct 2022 10:00:45 +0800 Subject: [PATCH 0123/1423] RDMA/core: return -EOPNOSUPP for ODP unsupported device ib_reg_mr(3) which is used to register a MR with specific access flags for specific HCA will set errno when something go wrong. So, here we should return the specific -EOPNOTSUPP when the being requested ODP access flag is unsupported by the HCA(such as RXE). Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20221001020045.8324-1-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 975d6e9efbcb4..a1f4d53a4bb63 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4334,7 +4334,7 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) - return -EINVAL; + return -EOPNOTSUPP; return 0; } -- GitLab From 7ac7bfe746d8faddbd79abed526ee67f46d8867c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Sun, 9 Oct 2022 16:10:47 +0800 Subject: [PATCH 0124/1423] RDMA/opa_vnic: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Link: https://lore.kernel.org/r/20221009081047.2643471-1-13667453960@163.com Signed-off-by: Leon Romanovsky --- include/rdma/opa_vnic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index f3d5377b217a6..d297f084001a5 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -51,7 +51,7 @@ static inline void *opa_vnic_dev_priv(const struct net_device *dev) return oparn->dev_priv; } -/* opa_vnic skb meta data structrue */ +/* opa_vnic skb meta data structure */ struct opa_vnic_skb_mdata { u8 vl; u8 entropy; -- GitLab From acc7d94ab431fb6f34e41588f2784410c2d9008e Mon Sep 17 00:00:00 2001 From: Sergey Gorenko Date: Sun, 16 Oct 2022 12:38:31 +0300 Subject: [PATCH 0125/1423] IB/iser: open code iser_conn_state_comp_exch There is a single caller to iser_conn_state_comp_exch. Open code its logic and remove it. Acked-by: Max Gurtovoy Signed-off-by: Sergey Gorenko Link: https://lore.kernel.org/r/20221016093833.12537-2-mgurtovoy@nvidia.com Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a00ca117303a9..a73c30230ff9e 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -347,22 +347,6 @@ static void iser_device_try_release(struct iser_device *device) mutex_unlock(&ig.device_list_mutex); } -/* - * Called with state mutex held - */ -static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, - enum iser_conn_state comp, - enum iser_conn_state exch) -{ - int ret; - - ret = (iser_conn->state == comp); - if (ret) - iser_conn->state = exch; - - return ret; -} - void iser_release_work(struct work_struct *work) { struct iser_conn *iser_conn; @@ -465,10 +449,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) int err = 0; /* terminate the iser conn only if the conn state is UP */ - if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) + if (iser_conn->state != ISER_CONN_UP) return 0; + iser_conn->state = ISER_CONN_TERMINATING; iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); /* suspend queuing of new iscsi commands */ -- GitLab From a75243ae08d23272235cb26117464843538936b4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 16 Oct 2022 12:38:32 +0300 Subject: [PATCH 0126/1423] IB/iser: add safety checks for state_mutex lock In some cases, we need to make sure that state_mutex is taken. Use lockdep_assert_held to warn us in case it doesn't while it should. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20221016093833.12537-3-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 26 ++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a73c30230ff9e..f33e3a7f605d2 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -448,6 +448,8 @@ int iser_conn_terminate(struct iser_conn *iser_conn) struct ib_conn *ib_conn = &iser_conn->ib_conn; int err = 0; + lockdep_assert_held(&iser_conn->state_mutex); + /* terminate the iser conn only if the conn state is UP */ if (iser_conn->state != ISER_CONN_UP) return 0; @@ -482,9 +484,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) */ static void iser_connect_error(struct rdma_cm_id *cma_id) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = cma_id->context; + + lockdep_assert_held(&iser_conn->state_mutex); - iser_conn = cma_id->context; iser_conn->state = ISER_CONN_TERMINATING; } @@ -526,12 +529,13 @@ static void iser_calc_scsi_params(struct iser_conn *iser_conn, */ static void iser_addr_handler(struct rdma_cm_id *cma_id) { + struct iser_conn *iser_conn = cma_id->context; struct iser_device *device; - struct iser_conn *iser_conn; struct ib_conn *ib_conn; int ret; - iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -581,6 +585,8 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct ib_conn *ib_conn = &iser_conn->ib_conn; struct ib_device *ib_dev = ib_conn->device->ib_device; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -613,14 +619,18 @@ failure: iser_connect_error(cma_id); } +/* + * Called with state mutex held + */ static void iser_connected_handler(struct rdma_cm_id *cma_id, const void *private_data) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = cma_id->context; struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; - iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -654,11 +664,15 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) } } +/* + * Called with state mutex held + */ static void iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy) { struct iser_conn *iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); /* * We are not guaranteed that we visited disconnected_handler * by now, call it here to be safe that we handle CM drep -- GitLab From c1842f34fceef47d6285e558004f8e2d6ed91b91 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 16 Oct 2022 12:38:33 +0300 Subject: [PATCH 0127/1423] IB/iser: open code iser_disconnected_handler There is a single caller to iser_disconnected_handler. Open code its logic and remove it. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20221016093833.12537-4-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index f33e3a7f605d2..1b8eda0dae4e0 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -651,19 +651,6 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id, complete(&iser_conn->up_completion); } -static void iser_disconnected_handler(struct rdma_cm_id *cma_id) -{ - struct iser_conn *iser_conn = cma_id->context; - - if (iser_conn_terminate(iser_conn)) { - if (iser_conn->iscsi_conn) - iscsi_conn_failure(iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); - else - iser_err("iscsi_iser connection isn't bound\n"); - } -} - /* * Called with state mutex held */ @@ -678,7 +665,13 @@ static void iser_cleanup_handler(struct rdma_cm_id *cma_id, * by now, call it here to be safe that we handle CM drep * and flush errors. */ - iser_disconnected_handler(cma_id); + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } iser_free_ib_conn_res(iser_conn, destroy); complete(&iser_conn->ib_completion); } -- GitLab From 326c3753a6358ffab607749ea0aa95d1d0ad79b0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:02 -0700 Subject: [PATCH 0128/1423] gpiolib: of: add a quirk for legacy names in Mediatek mt2701-cs42448 The driver is using non-standard "i2s1-in-sel-gpio1" and "i2s1-in-sel-gpio2" names to describe its gpios. In preparation to converting to the standard naming (i2s1-in-sel-gpios) and switching the driver to gpiod API add a quirk to gpiolib to keep compatibility with existing DTSes. Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 0e4e1291604d6..cef4f66341256 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -488,6 +488,38 @@ static struct gpio_desc *of_find_usb_gpio(struct device_node *np, return of_get_named_gpiod_flags(np, con_id, idx, of_flags); } +static struct gpio_desc *of_find_mt2701_gpio(struct device_node *np, + const char *con_id, + unsigned int idx, + enum of_gpio_flags *of_flags) +{ + struct gpio_desc *desc; + const char *legacy_id; + + if (!IS_ENABLED(CONFIG_SND_SOC_MT2701_CS42448)) + return ERR_PTR(-ENOENT); + + if (!of_device_is_compatible(np, "mediatek,mt2701-cs42448-machine")) + return ERR_PTR(-ENOENT); + + if (!con_id || strcmp(con_id, "i2s1-in-sel")) + return ERR_PTR(-ENOENT); + + if (idx == 0) + legacy_id = "i2s1-in-sel-gpio1"; + else if (idx == 1) + legacy_id = "i2s1-in-sel-gpio2"; + else + return ERR_PTR(-ENOENT); + + desc = of_get_named_gpiod_flags(np, legacy_id, 0, of_flags); + if (!gpiod_not_found(desc)) + pr_info("%s is using legacy gpio name '%s' instead of '%s-gpios'\n", + of_node_full_name(np), legacy_id, con_id); + + return desc; +} + typedef struct gpio_desc *(*of_find_gpio_quirk)(struct device_node *np, const char *con_id, unsigned int idx, @@ -498,6 +530,7 @@ static const of_find_gpio_quirk of_find_gpio_quirks[] = { of_find_regulator_gpio, of_find_arizona_gpio, of_find_usb_gpio, + of_find_mt2701_gpio, NULL }; -- GitLab From b311c5cba779a87e85525d351965bbd2c18111de Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:03 -0700 Subject: [PATCH 0129/1423] gpiolib: of: consolidate simple renames into a single quirk This consolidates all quirks doing simple renames (either allowing suffix-less names or trivial renames, when index changes are not required) into a single quirk. Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 183 +++++++++++++++----------------------- 1 file changed, 71 insertions(+), 112 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index cef4f66341256..63c6fa3086f3c 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -365,127 +365,90 @@ struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node, } EXPORT_SYMBOL_GPL(gpiod_get_from_of_node); -/* - * The SPI GPIO bindings happened before we managed to establish that GPIO - * properties should be named "foo-gpios" so we have this special kludge for - * them. - */ -static struct gpio_desc *of_find_spi_gpio(struct device_node *np, - const char *con_id, - unsigned int idx, - enum of_gpio_flags *of_flags) -{ - char prop_name[32]; /* 32 is max size of property name */ - - /* - * Hopefully the compiler stubs the rest of the function if this - * is false. - */ - if (!IS_ENABLED(CONFIG_SPI_MASTER)) - return ERR_PTR(-ENOENT); - - /* Allow this specifically for "spi-gpio" devices */ - if (!of_device_is_compatible(np, "spi-gpio") || !con_id) - return ERR_PTR(-ENOENT); - - /* Will be "gpio-sck", "gpio-mosi" or "gpio-miso" */ - snprintf(prop_name, sizeof(prop_name), "%s-%s", "gpio", con_id); - - return of_get_named_gpiod_flags(np, prop_name, idx, of_flags); -} - -/* - * The old Freescale bindings use simply "gpios" as name for the chip select - * lines rather than "cs-gpios" like all other SPI hardware. Account for this - * with a special quirk. - */ -static struct gpio_desc *of_find_spi_cs_gpio(struct device_node *np, +static struct gpio_desc *of_find_gpio_rename(struct device_node *np, const char *con_id, unsigned int idx, enum of_gpio_flags *of_flags) { - if (!IS_ENABLED(CONFIG_SPI_MASTER)) - return ERR_PTR(-ENOENT); - - /* Allow this specifically for Freescale and PPC devices */ - if (!of_device_is_compatible(np, "fsl,spi") && - !of_device_is_compatible(np, "aeroflexgaisler,spictrl") && - !of_device_is_compatible(np, "ibm,ppc4xx-spi")) - return ERR_PTR(-ENOENT); - /* Allow only if asking for "cs-gpios" */ - if (!con_id || strcmp(con_id, "cs")) - return ERR_PTR(-ENOENT); + static const struct of_rename_gpio { + const char *con_id; + const char *legacy_id; /* NULL - same as con_id */ + /* + * Compatible string can be set to NULL in case where + * matching to a particular compatible is not practical, + * but it should only be done for gpio names that have + * vendor prefix to reduce risk of false positives. + * Addition of such entries is strongly discouraged. + */ + const char *compatible; + } gpios[] = { +#if IS_ENABLED(CONFIG_MFD_ARIZONA) + { "wlf,reset", NULL, NULL }, +#endif +#if IS_ENABLED(CONFIG_REGULATOR) + /* + * Some regulator bindings happened before we managed to + * establish that GPIO properties should be named + * "foo-gpios" so we have this special kludge for them. + */ + { "wlf,ldoena", NULL, NULL }, /* Arizona */ + { "wlf,ldo1ena", NULL, NULL }, /* WM8994 */ + { "wlf,ldo2ena", NULL, NULL }, /* WM8994 */ +#endif +#if IS_ENABLED(CONFIG_SPI_MASTER) - /* - * While all other SPI controllers use "cs-gpios" the Freescale - * uses just "gpios" so translate to that when "cs-gpios" is - * requested. - */ - return of_get_named_gpiod_flags(np, "gpios", idx, of_flags); -} + /* + * The SPI GPIO bindings happened before we managed to + * establish that GPIO properties should be named + * "foo-gpios" so we have this special kludge for them. + */ + { "miso", "gpio-miso", "spi-gpio" }, + { "mosi", "gpio-mosi", "spi-gpio" }, + { "sck", "gpio-sck", "spi-gpio" }, -/* - * Some regulator bindings happened before we managed to establish that GPIO - * properties should be named "foo-gpios" so we have this special kludge for - * them. - */ -static struct gpio_desc *of_find_regulator_gpio(struct device_node *np, - const char *con_id, - unsigned int idx, - enum of_gpio_flags *of_flags) -{ - /* These are the connection IDs we accept as legacy GPIO phandles */ - const char *whitelist[] = { - "wlf,ldoena", /* Arizona */ - "wlf,ldo1ena", /* WM8994 */ - "wlf,ldo2ena", /* WM8994 */ + /* + * The old Freescale bindings use simply "gpios" as name + * for the chip select lines rather than "cs-gpios" like + * all other SPI hardware. Allow this specifically for + * Freescale and PPC devices. + */ + { "cs", "gpios", "fsl,spi" }, + { "cs", "gpios", "aeroflexgaisler,spictrl" }, + { "cs", "gpios", "ibm,ppc4xx-spi" }, +#endif +#if IS_ENABLED(CONFIG_TYPEC_FUSB302) + /* + * Fairchild FUSB302 host is using undocumented "fcs,int_n" + * property without the compulsory "-gpios" suffix. + */ + { "fcs,int_n", NULL, "fcs,fusb302" }, +#endif }; - int i; - - if (!IS_ENABLED(CONFIG_REGULATOR)) - return ERR_PTR(-ENOENT); + struct gpio_desc *desc; + const char *legacy_id; + unsigned int i; if (!con_id) return ERR_PTR(-ENOENT); - i = match_string(whitelist, ARRAY_SIZE(whitelist), con_id); - if (i < 0) - return ERR_PTR(-ENOENT); - - return of_get_named_gpiod_flags(np, con_id, idx, of_flags); -} - -static struct gpio_desc *of_find_arizona_gpio(struct device_node *np, - const char *con_id, - unsigned int idx, - enum of_gpio_flags *of_flags) -{ - if (!IS_ENABLED(CONFIG_MFD_ARIZONA)) - return ERR_PTR(-ENOENT); - - if (!con_id || strcmp(con_id, "wlf,reset")) - return ERR_PTR(-ENOENT); - - return of_get_named_gpiod_flags(np, con_id, idx, of_flags); -} + for (i = 0; i < ARRAY_SIZE(gpios); i++) { + if (strcmp(con_id, gpios[i].con_id)) + continue; -static struct gpio_desc *of_find_usb_gpio(struct device_node *np, - const char *con_id, - unsigned int idx, - enum of_gpio_flags *of_flags) -{ - /* - * Currently this USB quirk is only for the Fairchild FUSB302 host - * which is using an undocumented DT GPIO line named "fcs,int_n" - * without the compulsory "-gpios" suffix. - */ - if (!IS_ENABLED(CONFIG_TYPEC_FUSB302)) - return ERR_PTR(-ENOENT); + if (gpios[i].compatible && + !of_device_is_compatible(np, gpios[i].compatible)) + continue; - if (!con_id || strcmp(con_id, "fcs,int_n")) - return ERR_PTR(-ENOENT); + legacy_id = gpios[i].legacy_id ?: gpios[i].con_id; + desc = of_get_named_gpiod_flags(np, legacy_id, idx, of_flags); + if (!gpiod_not_found(desc)) { + pr_info("%s uses legacy gpio name '%s' instead of '%s-gpios'\n", + of_node_full_name(np), legacy_id, con_id); + return desc; + } + } - return of_get_named_gpiod_flags(np, con_id, idx, of_flags); + return ERR_PTR(-ENOENT); } static struct gpio_desc *of_find_mt2701_gpio(struct device_node *np, @@ -525,11 +488,7 @@ typedef struct gpio_desc *(*of_find_gpio_quirk)(struct device_node *np, unsigned int idx, enum of_gpio_flags *of_flags); static const of_find_gpio_quirk of_find_gpio_quirks[] = { - of_find_spi_gpio, - of_find_spi_cs_gpio, - of_find_regulator_gpio, - of_find_arizona_gpio, - of_find_usb_gpio, + of_find_gpio_rename, of_find_mt2701_gpio, NULL }; -- GitLab From 307c593ba5f915e308fd23a2daae7e9a5209b604 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:04 -0700 Subject: [PATCH 0130/1423] gpiolib: of: tighten selection of gpio renaming quirks Tighten selection of legacy gpio renaming quirks so that they only considered on more relevant configurations. Suggested-by: Daniel Thompson Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 63c6fa3086f3c..7d4bbf6484bc7 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -385,18 +385,21 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, #if IS_ENABLED(CONFIG_MFD_ARIZONA) { "wlf,reset", NULL, NULL }, #endif -#if IS_ENABLED(CONFIG_REGULATOR) + /* * Some regulator bindings happened before we managed to * establish that GPIO properties should be named * "foo-gpios" so we have this special kludge for them. */ +#if IS_ENABLED(CONFIG_REGULATOR_ARIZONA_LDO1) { "wlf,ldoena", NULL, NULL }, /* Arizona */ +#endif +#if IS_ENABLED(CONFIG_REGULATOR_WM8994) { "wlf,ldo1ena", NULL, NULL }, /* WM8994 */ { "wlf,ldo2ena", NULL, NULL }, /* WM8994 */ #endif -#if IS_ENABLED(CONFIG_SPI_MASTER) +#if IS_ENABLED(CONFIG_SPI_GPIO) /* * The SPI GPIO bindings happened before we managed to * establish that GPIO properties should be named @@ -405,6 +408,7 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, { "miso", "gpio-miso", "spi-gpio" }, { "mosi", "gpio-mosi", "spi-gpio" }, { "sck", "gpio-sck", "spi-gpio" }, +#endif /* * The old Freescale bindings use simply "gpios" as name @@ -412,10 +416,14 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, * all other SPI hardware. Allow this specifically for * Freescale and PPC devices. */ +#if IS_ENABLED(CONFIG_SPI_FSL_SPI) { "cs", "gpios", "fsl,spi" }, { "cs", "gpios", "aeroflexgaisler,spictrl" }, +#endif +#if IS_ENABLED(CONFIG_SPI_PPC4xx) { "cs", "gpios", "ibm,ppc4xx-spi" }, #endif + #if IS_ENABLED(CONFIG_TYPEC_FUSB302) /* * Fairchild FUSB302 host is using undocumented "fcs,int_n" -- GitLab From fbbbcd177a27508a47c5136b31de5cf4c8d0ab1c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:05 -0700 Subject: [PATCH 0131/1423] gpiolib: of: add quirk for locating reset lines with legacy bindings Some legacy mappings used "gpio[s]-reset" instead of "reset-gpios", add a quirk so that gpiod API will still work on unmodified DTSes. Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 7d4bbf6484bc7..2b5d1b3095c7b 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -382,9 +382,18 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, */ const char *compatible; } gpios[] = { +#if !IS_ENABLED(CONFIG_LCD_HX8357) + /* Himax LCD controllers used "gpios-reset" */ + { "reset", "gpios-reset", "himax,hx8357" }, + { "reset", "gpios-reset", "himax,hx8369" }, +#endif #if IS_ENABLED(CONFIG_MFD_ARIZONA) { "wlf,reset", NULL, NULL }, #endif +#if !IS_ENABLED(CONFIG_PCI_LANTIQ) + /* MIPS Lantiq PCI */ + { "reset", "gpios-reset", "lantiq,pci-xway" }, +#endif /* * Some regulator bindings happened before we managed to @@ -399,6 +408,13 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, { "wlf,ldo2ena", NULL, NULL }, /* WM8994 */ #endif +#if IS_ENABLED(CONFIG_SND_SOC_TLV320AIC3X) + { "reset", "gpio-reset", "ti,tlv320aic3x" }, + { "reset", "gpio-reset", "ti,tlv320aic33" }, + { "reset", "gpio-reset", "ti,tlv320aic3007" }, + { "reset", "gpio-reset", "ti,tlv320aic3104" }, + { "reset", "gpio-reset", "ti,tlv320aic3106" }, +#endif #if IS_ENABLED(CONFIG_SPI_GPIO) /* * The SPI GPIO bindings happened before we managed to -- GitLab From 9c2cc7171e08eef52110d272fdf2225d6dcd81b6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:06 -0700 Subject: [PATCH 0132/1423] gpiolib: of: add a quirk for reset line for Marvell NFC controller The controller is using non-standard "reset-n-io" name for its reset gpio property, whereas gpiod API expects "-gpios". Add a quirk so that gpiod API will still work on unmodified DTSes. Reviewed-by: Daniel Thompson Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 2b5d1b3095c7b..a9cedc39a2459 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -390,6 +390,16 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, #if IS_ENABLED(CONFIG_MFD_ARIZONA) { "wlf,reset", NULL, NULL }, #endif +#if IS_ENABLED(CONFIG_NFC_MRVL_I2C) + { "reset", "reset-n-io", "marvell,nfc-i2c" }, +#endif +#if IS_ENABLED(CONFIG_NFC_MRVL_SPI) + { "reset", "reset-n-io", "marvell,nfc-spi" }, +#endif +#if IS_ENABLED(CONFIG_NFC_MRVL_UART) + { "reset", "reset-n-io", "marvell,nfc-uart" }, + { "reset", "reset-n-io", "mrvl,nfc-uart" }, +#endif #if !IS_ENABLED(CONFIG_PCI_LANTIQ) /* MIPS Lantiq PCI */ { "reset", "gpios-reset", "lantiq,pci-xway" }, -- GitLab From 944004eb56dc977ad5f882ca4338f45396052317 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:07 -0700 Subject: [PATCH 0133/1423] gpiolib: of: add a quirk for reset line for Cirrus CS42L56 codec The controller is using non-standard "cirrus,gpio-nreset" name for its reset gpio property, whereas gpiod API expects "-gpios". Add a quirk so that gpiod API will still work on unmodified DTSes. Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index a9cedc39a2459..ffdbac2eeaa6f 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -418,6 +418,9 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, { "wlf,ldo2ena", NULL, NULL }, /* WM8994 */ #endif +#if IS_ENABLED(CONFIG_SND_SOC_CS42L56) + { "reset", "cirrus,gpio-nreset", "cirrus,cs42l56" }, +#endif #if IS_ENABLED(CONFIG_SND_SOC_TLV320AIC3X) { "reset", "gpio-reset", "ti,tlv320aic3x" }, { "reset", "gpio-reset", "ti,tlv320aic33" }, -- GitLab From eaf1a29665cda1c767cac0d523828892bd77a842 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:08 -0700 Subject: [PATCH 0134/1423] gpiolib: of: add a quirk for legacy names in MOXA ART RTC The driver is using non-standard "gpio-rtc-data", "gpio-rtc-sclk", and "gpio-rtc-reset" names for properties describing its gpios. In preparation to converting to the standard naming ("rtc-*-gpios") and switching the driver to gpiod API add a quirk to gpiolib to keep compatibility with existing DTSes. Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index ffdbac2eeaa6f..d22498c72a678 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -390,6 +390,11 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, #if IS_ENABLED(CONFIG_MFD_ARIZONA) { "wlf,reset", NULL, NULL }, #endif +#if IS_ENABLED(CONFIG_RTC_DRV_MOXART) + { "rtc-data", "gpio-rtc-data", "moxa,moxart-rtc" }, + { "rtc-sclk", "gpio-rtc-sclk", "moxa,moxart-rtc" }, + { "rtc-reset", "gpio-rtc-reset", "moxa,moxart-rtc" }, +#endif #if IS_ENABLED(CONFIG_NFC_MRVL_I2C) { "reset", "reset-n-io", "marvell,nfc-i2c" }, #endif -- GitLab From e3186e36925fc18384492491ebcf3da749780a30 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:09 -0700 Subject: [PATCH 0135/1423] gpiolib: of: factor out code overriding gpio line polarity There are several instances where we use a separate property to override polarity specified in gpio property. Factor it out into a separate function. Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 48 +++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index d22498c72a678..6faf0dc7bc31b 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -130,6 +130,28 @@ bool of_gpio_need_valid_mask(const struct gpio_chip *gc) return false; } +/* + * Overrides stated polarity of a gpio line and warns when there is a + * discrepancy. + */ +static void of_gpio_quirk_polarity(const struct device_node *np, + bool active_high, + enum of_gpio_flags *flags) +{ + if (active_high) { + if (*flags & OF_GPIO_ACTIVE_LOW) { + pr_warn("%s GPIO handle specifies active low - ignored\n", + of_node_full_name(np)); + *flags &= ~OF_GPIO_ACTIVE_LOW; + } + } else { + if (!(*flags & OF_GPIO_ACTIVE_LOW)) + pr_info("%s enforce active low on GPIO handle\n", + of_node_full_name(np)); + *flags |= OF_GPIO_ACTIVE_LOW; + } +} + static void of_gpio_flags_quirks(const struct device_node *np, const char *propname, enum of_gpio_flags *flags, @@ -145,7 +167,7 @@ static void of_gpio_flags_quirks(const struct device_node *np, (!(strcmp(propname, "enable-gpio") && strcmp(propname, "enable-gpios")) && of_device_is_compatible(np, "regulator-gpio")))) { - bool active_low = !of_property_read_bool(np, + bool active_high = of_property_read_bool(np, "enable-active-high"); /* * The regulator GPIO handles are specified such that the @@ -153,13 +175,7 @@ static void of_gpio_flags_quirks(const struct device_node *np, * the polarity of the GPIO line. Any phandle flags must * be actively ignored. */ - if ((*flags & OF_GPIO_ACTIVE_LOW) && !active_low) { - pr_warn("%s GPIO handle specifies active low - ignored\n", - of_node_full_name(np)); - *flags &= ~OF_GPIO_ACTIVE_LOW; - } - if (active_low) - *flags |= OF_GPIO_ACTIVE_LOW; + of_gpio_quirk_polarity(np, active_high, flags); } /* * Legacy open drain handling for fixed voltage regulators. @@ -200,18 +216,10 @@ static void of_gpio_flags_quirks(const struct device_node *np, * conflict and the "spi-cs-high" flag will * take precedence. */ - if (of_property_read_bool(child, "spi-cs-high")) { - if (*flags & OF_GPIO_ACTIVE_LOW) { - pr_warn("%s GPIO handle specifies active low - ignored\n", - of_node_full_name(child)); - *flags &= ~OF_GPIO_ACTIVE_LOW; - } - } else { - if (!(*flags & OF_GPIO_ACTIVE_LOW)) - pr_info("%s enforce active low on chipselect handle\n", - of_node_full_name(child)); - *flags |= OF_GPIO_ACTIVE_LOW; - } + bool active_high = of_property_read_bool(child, + "spi-cs-high"); + of_gpio_quirk_polarity(child, active_high, + flags); of_node_put(child); break; } -- GitLab From b02c85c9458cdd15e2c43413d7d2541a468cde57 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:10 -0700 Subject: [PATCH 0136/1423] gpiolib: of: add quirk for phy reset polarity for Freescale Ethernet Bindings for Freescale Fast Ethernet Controller use a separate property "phy-reset-active-high" to specify polarity of its phy gpio line. To allow converting the driver to gpiod API we need to add this quirk to gpiolib. Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 6faf0dc7bc31b..c2a55ffb2b203 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -231,6 +231,33 @@ static void of_gpio_flags_quirks(const struct device_node *np, !strcmp(propname, "snps,reset-gpio") && of_property_read_bool(np, "snps,reset-active-low")) *flags |= OF_GPIO_ACTIVE_LOW; + + /* + * Freescale Fast Ethernet Controller uses a separate property to + * describe polarity of the phy reset line. + */ + if (IS_ENABLED(CONFIG_FEC)) { + static const char * const fec_devices[] = { + "fsl,imx25-fec", + "fsl,imx27-fec", + "fsl,imx28-fec", + "fsl,imx6q-fec", + "fsl,mvf600-fec", + "fsl,imx6sx-fec", + "fsl,imx6ul-fec", + "fsl,imx8mq-fec", + "fsl,imx8qm-fec", + "fsl,s32v234-fec", + NULL + }; + + if (!strcmp(propname, "phy-reset-gpios") && + of_device_compatible_match(np, fec_devices)) { + bool active_high = of_property_read_bool(np, + "phy-reset-active-high"); + of_gpio_quirk_polarity(np, active_high, flags); + } + } } /** -- GitLab From 99d18d42c942854a073191714a311dc2420ec7d3 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 17 Oct 2022 22:41:11 -0700 Subject: [PATCH 0137/1423] gpiolib: of: add a quirk for reset line polarity for Himax LCDs Existing DTS that use legacy (non-standard) property name for the reset line "gpios-reset" also specify incorrect polarity (0 which maps to "active high"). Add a quirk to force polarity to "active low" so that once driver is converted to gpiod API that pays attention to line polarity it will work properly. Reviewed-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index c2a55ffb2b203..52616848a37c4 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -152,11 +152,47 @@ static void of_gpio_quirk_polarity(const struct device_node *np, } } +/* + * This quirk does static polarity overrides in cases where existing + * DTS specified incorrect polarity. + */ +static void of_gpio_try_fixup_polarity(const struct device_node *np, + const char *propname, + enum of_gpio_flags *flags) +{ + static const struct { + const char *compatible; + const char *propname; + bool active_high; + } gpios[] = { +#if !IS_ENABLED(CONFIG_LCD_HX8357) + /* + * Himax LCD controllers used incorrectly named + * "gpios-reset" property and also specified wrong + * polarity. + */ + { "himax,hx8357", "gpios-reset", false }, + { "himax,hx8369", "gpios-reset", false }, +#endif + }; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(gpios); i++) { + if (of_device_is_compatible(np, gpios[i].compatible) && + !strcmp(propname, gpios[i].propname)) { + of_gpio_quirk_polarity(np, gpios[i].active_high, flags); + break; + } + } +} + static void of_gpio_flags_quirks(const struct device_node *np, const char *propname, enum of_gpio_flags *flags, int index) { + of_gpio_try_fixup_polarity(np, propname, flags); + /* * Some GPIO fixed regulator quirks. * Note that active low is the default. -- GitLab From dbf53a29b28b277fa952a000245b558536c6bdd7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 19 Oct 2022 18:59:45 +0200 Subject: [PATCH 0138/1423] x86/paravirt: Fix a !PARAVIRT build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix ./include/trace/events/xen.h:28:31: warning: ‘enum paravirt_lazy_mode’ \ declared inside parameter list will not be visible outside of this definition or declaration which turns into a build error: ./include/trace/events/xen.h:28:50: error: parameter 1 (‘mode’) has incomplete type 28 | TP_PROTO(enum paravirt_lazy_mode mode), \ due to enum paravirt_lazy_mode being visible only under CONFIG_PARAVIRT. Just pull it up where it is unconditionally visible. Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y1AtAXM8YjtBm2cj@zn.tnic --- arch/x86/include/asm/paravirt_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e137d94121232..27c692791b7ea 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -9,6 +9,13 @@ struct paravirt_patch_site { u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; #endif #ifdef CONFIG_PARAVIRT @@ -582,13 +589,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - enum paravirt_lazy_mode paravirt_get_lazy_mode(void); void paravirt_start_context_switch(struct task_struct *prev); void paravirt_end_context_switch(struct task_struct *next); -- GitLab From 45e6319bd5f2154d8b8c9f1eaa4ac030ba0d330c Mon Sep 17 00:00:00 2001 From: Zhiqi Song Date: Sat, 24 Sep 2022 15:38:31 +0800 Subject: [PATCH 0139/1423] crypto: hisilicon/hpre - fix resource leak in remove process In hpre_remove(), when the disable operation of qm sriov failed, the following logic should continue to be executed to release the remaining resources that have been allocated, instead of returning directly, otherwise there will be resource leakage. Signed-off-by: Zhiqi Song Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index 471e5ca720f57..baf1faec7046f 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -1437,18 +1437,12 @@ err_with_qm_init: static void hpre_remove(struct pci_dev *pdev) { struct hisi_qm *qm = pci_get_drvdata(pdev); - int ret; hisi_qm_pm_uninit(qm); hisi_qm_wait_task_finish(qm, &hpre_devices); hisi_qm_alg_unregister(qm, &hpre_devices); - if (qm->fun_type == QM_HW_PF && qm->vfs_num) { - ret = hisi_qm_sriov_disable(pdev, true); - if (ret) { - pci_err(pdev, "Disable SRIOV fail!\n"); - return; - } - } + if (qm->fun_type == QM_HW_PF && qm->vfs_num) + hisi_qm_sriov_disable(pdev, true); hpre_debugfs_exit(qm); hisi_qm_stop(qm, QM_NORMAL); -- GitLab From 7001141d34e550854425afa76e960513cf150a62 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 24 Sep 2022 17:34:24 +0800 Subject: [PATCH 0140/1423] crypto: hisilicon/qm - drop unnecessary IS_ENABLE(CONFIG_NUMA) check dev_to_node() can handle the case when CONFIG_NUMA is not set, so the check of CONFIG_NUMA is redundant and can be removed. Signed-off-by: Yicong Yang Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 8b387de69d229..9a38e170fb1d1 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4277,16 +4277,14 @@ static int hisi_qm_sort_devices(int node, struct list_head *head, struct hisi_qm *qm; struct list_head *n; struct device *dev; - int dev_node = 0; + int dev_node; list_for_each_entry(qm, &qm_list->list, list) { dev = &qm->pdev->dev; - if (IS_ENABLED(CONFIG_NUMA)) { - dev_node = dev_to_node(dev); - if (dev_node < 0) - dev_node = 0; - } + dev_node = dev_to_node(dev); + if (dev_node < 0) + dev_node = 0; res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) -- GitLab From f57e292897cac13b6ddee078aea21173b234ecb7 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 24 Sep 2022 18:14:42 +0800 Subject: [PATCH 0141/1423] crypto: hisilicon/qm - fix incorrect parameters usage In qm_get_xqc_depth(), parameters low_bits and high_bits save the values of the corresponding bits. However, the values saved by the two parameters are opposite. As a result, the values returned to the callers are incorrect. Fixes: 129a9f340172 ("crypto: hisilicon/qm - get qp num and depth from hardware registers") Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 9a38e170fb1d1..01c083e2c4bd3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -909,8 +909,8 @@ static void qm_get_xqc_depth(struct hisi_qm *qm, u16 *low_bits, u32 depth; depth = hisi_qm_get_hw_info(qm, qm_basic_info, type, qm->cap_ver); - *high_bits = depth & QM_XQ_DEPTH_MASK; - *low_bits = (depth >> QM_XQ_DEPTH_SHIFT) & QM_XQ_DEPTH_MASK; + *low_bits = depth & QM_XQ_DEPTH_MASK; + *high_bits = (depth >> QM_XQ_DEPTH_SHIFT) & QM_XQ_DEPTH_MASK; } static u32 qm_get_irq_num(struct hisi_qm *qm) -- GitLab From 94adb03fd58bbe355e3d7a9d0f701889313e4a51 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 24 Sep 2022 18:34:45 +0800 Subject: [PATCH 0142/1423] crypto: hisilicon/sec - enabling clock gating of the address prefetch module Change the value of clock gating register to 0x7fff to enable clock gating of the address prefetch module. When the device is idle, the clock is turned off to save power. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec2/sec_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 3705412bac5f1..6eb8a16ba0a70 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -55,7 +55,7 @@ #define SEC_CONTROL_REG 0x301200 #define SEC_DYNAMIC_GATE_REG 0x30121c #define SEC_CORE_AUTO_GATE 0x30212c -#define SEC_DYNAMIC_GATE_EN 0x7bff +#define SEC_DYNAMIC_GATE_EN 0x7fff #define SEC_CORE_AUTO_GATE_EN GENMASK(3, 0) #define SEC_CLK_GATE_ENABLE BIT(3) #define SEC_CLK_GATE_DISABLE (~BIT(3)) -- GitLab From ee1537fe3dd89860d0336563891f6cac707d0cb5 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 24 Sep 2022 19:04:31 +0800 Subject: [PATCH 0143/1423] crypto: hisilicon/qm - re-enable communicate interrupt before notifying PF After the device is reset, the VF needs to re-enable communication interrupt before the VF sends restart complete message to the PF. If the interrupt is re-enabled after the VF notifies the PF, the PF may fail to send messages to the VF after receiving VF's restart complete message. Fixes: 760fe22cf5e9 ("crypto: hisilicon/qm - update reset flow") Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 01c083e2c4bd3..e3edb176d9765 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -5723,6 +5723,7 @@ static void qm_pf_reset_vf_done(struct hisi_qm *qm) cmd = QM_VF_START_FAIL; } + qm_cmd_init(qm); ret = qm_ping_pf(qm, cmd); if (ret) dev_warn(&pdev->dev, "PF responds timeout in reset done!\n"); @@ -5784,7 +5785,6 @@ static void qm_pf_reset_vf_process(struct hisi_qm *qm, goto err_get_status; qm_pf_reset_vf_done(qm); - qm_cmd_init(qm); dev_info(dev, "device reset done.\n"); -- GitLab From ad981647dbe1ea91071b9783dd62d74e22c6d955 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 26 Sep 2022 17:14:21 +0800 Subject: [PATCH 0144/1423] crypto: ccm - use local variables instead of indirect references The variable odata has been introduced into the function scope as a variable and should be used directly. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- crypto/ccm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/ccm.c b/crypto/ccm.c index 6b815ece51c6a..30dbae72728f7 100644 --- a/crypto/ccm.c +++ b/crypto/ccm.c @@ -218,7 +218,7 @@ static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain, cryptlen += ilen; } - ahash_request_set_crypt(ahreq, plain, pctx->odata, cryptlen); + ahash_request_set_crypt(ahreq, plain, odata, cryptlen); err = crypto_ahash_finup(ahreq); out: return err; -- GitLab From f30fe6314698d107edbb9db50bc3c3443a30ec80 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 26 Sep 2022 17:14:40 +0800 Subject: [PATCH 0145/1423] crypto: scatterwalk - remove duplicate function declarations scatterwalk_map() is an inline function already defined in the header file, it is necessary to delete the re-declaration at the same location, which was left out in the header file by an earlier modification. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- include/crypto/scatterwalk.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index ccdb05f68a75c..f2c42b4111b1b 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -93,7 +93,6 @@ static inline void scatterwalk_done(struct scatter_walk *walk, int out, void scatterwalk_copychunks(void *buf, struct scatter_walk *walk, size_t nbytes, int out); -void *scatterwalk_map(struct scatter_walk *walk); void scatterwalk_map_and_copy(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes, int out); -- GitLab From 237f9eceb2f3c888c1a26a4209243607d3cc0c7c Mon Sep 17 00:00:00 2001 From: ruanjinjie Date: Mon, 26 Sep 2022 17:27:11 +0800 Subject: [PATCH 0146/1423] crypto: ccp - Add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs Signed-off-by: ruanjinjie Acked-by: John Allen Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/ccp/ccp-crypto-main.c b/drivers/crypto/ccp/ccp-crypto-main.c index 5976530c00a8a..3321810273053 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -400,7 +400,7 @@ static void ccp_unregister_algs(void) } } -static int ccp_crypto_init(void) +static int __init ccp_crypto_init(void) { int ret; @@ -421,7 +421,7 @@ static int ccp_crypto_init(void) return ret; } -static void ccp_crypto_exit(void) +static void __exit ccp_crypto_exit(void) { ccp_unregister_algs(); } -- GitLab From 224f3a050e495a7c3c1bcee2c613d0996bc661dc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 26 Sep 2022 16:45:45 -0500 Subject: [PATCH 0147/1423] crypto: talitos - Replace zero-length arrays with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length arrays declarations in anonymous union with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for flexible-array members in unions. Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/216 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Herbert Xu --- drivers/crypto/talitos.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h index 32825119e8805..1a93ee355929d 100644 --- a/drivers/crypto/talitos.h +++ b/drivers/crypto/talitos.h @@ -65,8 +65,8 @@ struct talitos_edesc { dma_addr_t dma_link_tbl; struct talitos_desc desc; union { - struct talitos_ptr link_tbl[0]; - u8 buf[0]; + DECLARE_FLEX_ARRAY(struct talitos_ptr, link_tbl); + DECLARE_FLEX_ARRAY(u8, buf); }; }; -- GitLab From 22044d9b04b593831d8e16ba7aafabf4e75964f5 Mon Sep 17 00:00:00 2001 From: Peter Harliman Liem Date: Tue, 27 Sep 2022 11:10:08 +0800 Subject: [PATCH 0148/1423] crypto: inside-secure - Expand soc data structure Currently platform data is assigned directly to version string(instead of struct). To make it more scalable, we move it to use data struct instead. This allows customization for individual platforms other than version string. Signed-off-by: Peter Harliman Liem Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 44 +++++++++++++++++-------- drivers/crypto/inside-secure/safexcel.h | 6 +++- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index ad0d8c4a71ac1..8f4872470529c 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -410,10 +410,10 @@ static int eip197_load_firmwares(struct safexcel_crypto_priv *priv) int i, j, ret = 0, pe; int ipuesz, ifppsz, minifw = 0; - if (priv->version == EIP197D_MRVL) + if (priv->data->version == EIP197D_MRVL) dir = "eip197d"; - else if (priv->version == EIP197B_MRVL || - priv->version == EIP197_DEVBRD) + else if (priv->data->version == EIP197B_MRVL || + priv->data->version == EIP197_DEVBRD) dir = "eip197b"; else return -ENODEV; @@ -423,7 +423,7 @@ retry_fw: snprintf(fw_path, 37, "inside-secure/%s/%s", dir, fw_name[i]); ret = firmware_request_nowarn(&fw[i], fw_path, priv->dev); if (ret) { - if (minifw || priv->version != EIP197B_MRVL) + if (minifw || priv->data->version != EIP197B_MRVL) goto release_fw; /* Fallback to the old firmware location for the @@ -1597,7 +1597,7 @@ static int safexcel_probe_generic(void *pdev, safexcel_configure(priv); - if (IS_ENABLED(CONFIG_PCI) && priv->version == EIP197_DEVBRD) { + if (IS_ENABLED(CONFIG_PCI) && priv->data->version == EIP197_DEVBRD) { /* * Request MSI vectors for global + 1 per ring - * or just 1 for older dev images @@ -1731,7 +1731,7 @@ static int safexcel_probe(struct platform_device *pdev) return -ENOMEM; priv->dev = dev; - priv->version = (enum safexcel_eip_version)of_device_get_match_data(dev); + priv->data = (struct safexcel_priv_data *)of_device_get_match_data(dev); platform_set_drvdata(pdev, priv); @@ -1806,27 +1806,43 @@ static int safexcel_remove(struct platform_device *pdev) return 0; } +static const struct safexcel_priv_data eip97ies_mrvl_data = { + .version = EIP97IES_MRVL, +}; + +static const struct safexcel_priv_data eip197b_mrvl_data = { + .version = EIP197B_MRVL, +}; + +static const struct safexcel_priv_data eip197d_mrvl_data = { + .version = EIP197D_MRVL, +}; + +static const struct safexcel_priv_data eip197_devbrd_data = { + .version = EIP197_DEVBRD, +}; + static const struct of_device_id safexcel_of_match_table[] = { { .compatible = "inside-secure,safexcel-eip97ies", - .data = (void *)EIP97IES_MRVL, + .data = &eip97ies_mrvl_data, }, { .compatible = "inside-secure,safexcel-eip197b", - .data = (void *)EIP197B_MRVL, + .data = &eip197b_mrvl_data, }, { .compatible = "inside-secure,safexcel-eip197d", - .data = (void *)EIP197D_MRVL, + .data = &eip197d_mrvl_data, }, /* For backward compatibility and intended for generic use */ { .compatible = "inside-secure,safexcel-eip97", - .data = (void *)EIP97IES_MRVL, + .data = &eip97ies_mrvl_data, }, { .compatible = "inside-secure,safexcel-eip197", - .data = (void *)EIP197B_MRVL, + .data = &eip197b_mrvl_data, }, {}, }; @@ -1862,7 +1878,7 @@ static int safexcel_pci_probe(struct pci_dev *pdev, return -ENOMEM; priv->dev = dev; - priv->version = (enum safexcel_eip_version)ent->driver_data; + priv->data = (struct safexcel_priv_data *)ent->driver_data; pci_set_drvdata(pdev, priv); @@ -1881,7 +1897,7 @@ static int safexcel_pci_probe(struct pci_dev *pdev, } priv->base = pcim_iomap_table(pdev)[0]; - if (priv->version == EIP197_DEVBRD) { + if (priv->data->version == EIP197_DEVBRD) { dev_dbg(dev, "Device identified as FPGA based development board - applying HW reset\n"); rc = pcim_iomap_regions(pdev, 4, "crypto_safexcel"); @@ -1949,7 +1965,7 @@ static const struct pci_device_id safexcel_pci_ids[] = { { PCI_DEVICE_SUB(PCI_VENDOR_ID_XILINX, 0x9038, 0x16ae, 0xc522), - .driver_data = EIP197_DEVBRD, + .driver_data = (kernel_ulong_t)&eip197_devbrd_data, }, {}, }; diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index 797ff91512e0d..e8da8b30a3923 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -733,6 +733,10 @@ enum safexcel_eip_version { EIP197_DEVBRD }; +struct safexcel_priv_data { + enum safexcel_eip_version version; +}; + /* Priority we use for advertising our algorithms */ #define SAFEXCEL_CRA_PRIORITY 300 @@ -815,7 +819,7 @@ struct safexcel_crypto_priv { struct clk *reg_clk; struct safexcel_config config; - enum safexcel_eip_version version; + struct safexcel_priv_data *data; struct safexcel_register_offsets offsets; struct safexcel_hwconfig hwconfig; u32 flags; -- GitLab From 594ed3d245d3e2d0760f30724e02ecf1604b2c01 Mon Sep 17 00:00:00 2001 From: Peter Harliman Liem Date: Tue, 27 Sep 2022 11:10:09 +0800 Subject: [PATCH 0149/1423] crypto: inside-secure - Add fw_little_endian option This is to add fw_little_endian option, which can be used for platform which firmware is using little-endian (instead of big-endian). Signed-off-by: Peter Harliman Liem Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 14 ++++++++++---- drivers/crypto/inside-secure/safexcel.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index 8f4872470529c..4d6d64ff9a0f2 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -316,14 +316,20 @@ static void eip197_init_firmware(struct safexcel_crypto_priv *priv) static int eip197_write_firmware(struct safexcel_crypto_priv *priv, const struct firmware *fw) { - const __be32 *data = (const __be32 *)fw->data; + u32 val; int i; /* Write the firmware */ - for (i = 0; i < fw->size / sizeof(u32); i++) - writel(be32_to_cpu(data[i]), + for (i = 0; i < fw->size / sizeof(u32); i++) { + if (priv->data->fw_little_endian) + val = le32_to_cpu(((const __le32 *)fw->data)[i]); + else + val = be32_to_cpu(((const __be32 *)fw->data)[i]); + + writel(val, priv->base + EIP197_CLASSIFICATION_RAMS + - i * sizeof(__be32)); + i * sizeof(val)); + } /* Exclude final 2 NOPs from size */ return i - EIP197_FW_TERMINAL_NOPS; diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index e8da8b30a3923..f049293870b48 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -735,6 +735,7 @@ enum safexcel_eip_version { struct safexcel_priv_data { enum safexcel_eip_version version; + bool fw_little_endian; }; /* Priority we use for advertising our algorithms */ -- GitLab From 36dd88b1c09c78f993cb11dcc5f4211d78a10e5f Mon Sep 17 00:00:00 2001 From: Peter Harliman Liem Date: Tue, 27 Sep 2022 11:10:10 +0800 Subject: [PATCH 0150/1423] crypto: inside-secure - Add MaxLinear platform This is to add MaxLinear platform into compatible id. Firmware endianness option is added since MaxLinear firmware is in little endian format. Signed-off-by: Peter Harliman Liem Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 11 +++++++++++ drivers/crypto/inside-secure/safexcel.h | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index 4d6d64ff9a0f2..ae6110376e21c 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -421,6 +421,8 @@ static int eip197_load_firmwares(struct safexcel_crypto_priv *priv) else if (priv->data->version == EIP197B_MRVL || priv->data->version == EIP197_DEVBRD) dir = "eip197b"; + else if (priv->data->version == EIP197C_MXL) + dir = "eip197c"; else return -ENODEV; @@ -1828,6 +1830,11 @@ static const struct safexcel_priv_data eip197_devbrd_data = { .version = EIP197_DEVBRD, }; +static const struct safexcel_priv_data eip197c_mxl_data = { + .version = EIP197C_MXL, + .fw_little_endian = true, +}; + static const struct of_device_id safexcel_of_match_table[] = { { .compatible = "inside-secure,safexcel-eip97ies", @@ -1841,6 +1848,10 @@ static const struct of_device_id safexcel_of_match_table[] = { .compatible = "inside-secure,safexcel-eip197d", .data = &eip197d_mrvl_data, }, + { + .compatible = "inside-secure,safexcel-eip197c-mxl", + .data = &eip197c_mxl_data, + }, /* For backward compatibility and intended for generic use */ { .compatible = "inside-secure,safexcel-eip97", diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index f049293870b48..6c2fc662f64ff 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -730,7 +730,8 @@ enum safexcel_eip_version { EIP97IES_MRVL, EIP197B_MRVL, EIP197D_MRVL, - EIP197_DEVBRD + EIP197_DEVBRD, + EIP197C_MXL, }; struct safexcel_priv_data { -- GitLab From 839b8ae2fc10f205317bcc32c9de18456756e1f5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 08:55:55 +0000 Subject: [PATCH 0151/1423] crypto: sun8i-ss - use dma_addr instead u32 The DMA address need to be stored in a dma_addr_t Fixes: 359e893e8af4 ("crypto: sun8i-ss - rework handling of IV") Reported-by: Dan Carpenter Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index 910d6751644cf..902f6be057ec6 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -124,7 +124,7 @@ static int sun8i_ss_setup_ivs(struct skcipher_request *areq) unsigned int ivsize = crypto_skcipher_ivsize(tfm); struct sun8i_ss_flow *sf = &ss->flows[rctx->flow]; int i = 0; - u32 a; + dma_addr_t a; int err; rctx->ivlen = ivsize; -- GitLab From 375de984a3cb691a0bbaf9756e8595e0b54e27e0 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 27 Sep 2022 13:39:55 +0000 Subject: [PATCH 0152/1423] crypto: ccp - Remove unused struct ccp_crypto_cpu After commit bc3854476f36("crypto: ccp - Use a single queue for proper ordering of tfm requests"), no one use struct ccp_crypto_cpu, so remove it. Signed-off-by: Yuan Can Acked-by: John Allen Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-main.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/crypto/ccp/ccp-crypto-main.c b/drivers/crypto/ccp/ccp-crypto-main.c index 3321810273053..dd86d2650bea4 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -78,13 +78,6 @@ struct ccp_crypto_cmd { int ret; }; -struct ccp_crypto_cpu { - struct work_struct work; - struct completion completion; - struct ccp_crypto_cmd *crypto_cmd; - int err; -}; - static inline bool ccp_crypto_success(int err) { if (err && (err != -EINPROGRESS) && (err != -EBUSY)) -- GitLab From 094528b6a5a755b1195a01e10b13597d67d1a0e6 Mon Sep 17 00:00:00 2001 From: Natalia Petrova Date: Wed, 28 Sep 2022 13:25:05 +0300 Subject: [PATCH 0153/1423] crypto: nitrox - avoid double free on error path in nitrox_sriov_init() If alloc_workqueue() fails in nitrox_mbox_init() it deallocates ndev->iov.vfdev and returns error code, but then nitrox_sriov_init() calls nitrox_sriov_cleanup() where ndev->iov.vfdev is deallocated again. Fix this by nulling ndev->iov.vfdev after the first deallocation. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 9e5de3e06e54 ("crypto: cavium/nitrox - Add mailbox...") Signed-off-by: Natalia Petrova Signed-off-by: Alexey Khoroshilov Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_mbx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.c b/drivers/crypto/cavium/nitrox/nitrox_mbx.c index 9e7308e39b304..d4e06999af9b7 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_mbx.c +++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.c @@ -195,6 +195,7 @@ int nitrox_mbox_init(struct nitrox_device *ndev) ndev->iov.pf2vf_wq = alloc_workqueue("nitrox_pf2vf", 0, 0); if (!ndev->iov.pf2vf_wq) { kfree(ndev->iov.vfdev); + ndev->iov.vfdev = NULL; return -ENOMEM; } /* enable pf2vf mailbox interrupts */ -- GitLab From 10da230a4df1dfe32a58eb09246f5ffe82346f27 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 28 Sep 2022 13:45:05 -0500 Subject: [PATCH 0154/1423] crypto: ccp - Add support for TEE for PCI ID 0x14CA SoCs containing 0x14CA are present both in datacenter parts that support SEV as well as client parts that support TEE. Cc: stable@vger.kernel.org # 5.15+ Tested-by: Rijo-john Thomas Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sp-pci.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 792d6da7f0c07..084d052fddccb 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -381,6 +381,15 @@ static const struct psp_vdata pspv3 = { .inten_reg = 0x10690, .intsts_reg = 0x10694, }; + +static const struct psp_vdata pspv4 = { + .sev = &sevv2, + .tee = &teev1, + .feature_reg = 0x109fc, + .inten_reg = 0x10690, + .intsts_reg = 0x10694, +}; + #endif static const struct sp_dev_vdata dev_vdata[] = { @@ -426,7 +435,7 @@ static const struct sp_dev_vdata dev_vdata[] = { { /* 5 */ .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP - .psp_vdata = &pspv2, + .psp_vdata = &pspv4, #endif }, { /* 6 */ -- GitLab From be7f5ef9ff4bbe99e4fcdf63057a993be178af46 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 23:24:43 +0100 Subject: [PATCH 0155/1423] crypto: stm32 - Fix spelling mistake "wite" -> "write" There are a couple of spelling mistakes in dev_err messages. Fix them. Signed-off-by: Colin Ian King Acked-by: nicolas.toromanoff@foss.st.com Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 59ef541123ae6..59638dfce573b 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -1400,7 +1400,7 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) /* wait end of process */ err = stm32_cryp_wait_output(cryp); if (err) { - dev_err(cryp->dev, "Timeout (wite ccm padded data)\n"); + dev_err(cryp->dev, "Timeout (write ccm padded data)\n"); return stm32_cryp_finish_req(cryp, err); } @@ -1440,7 +1440,7 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) /* h) wait for completion */ err = stm32_cryp_wait_busy(cryp); if (err) - dev_err(cryp->dev, "Timeout (wite ccm padded data)\n"); + dev_err(cryp->dev, "Timeout (write ccm padded data)\n"); /* i) run the he normal Final phase */ stm32_cryp_finish_req(cryp, err); -- GitLab From 518a198f41d6539dc025f0e4fe2785f9031fa1eb Mon Sep 17 00:00:00 2001 From: Tomer Maimon Date: Thu, 29 Sep 2022 16:31:10 +0300 Subject: [PATCH 0156/1423] dt-bindings: rng: nuvoton,npcm-rng: Add npcm845 compatible string Add a compatible string for Nuvoton BMC NPCM845 RNG. Signed-off-by: Tomer Maimon Acked-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml b/Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml index abd134c9d4009..e8e4ab1e5b95d 100644 --- a/Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml +++ b/Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml @@ -16,7 +16,9 @@ maintainers: properties: compatible: - const: nuvoton,npcm750-rng + enum: + - nuvoton,npcm750-rng + - nuvoton,npcm845-rng reg: maxItems: 1 -- GitLab From f07b3e87fe62984db66fd4179ae7e960e4fc43e8 Mon Sep 17 00:00:00 2001 From: Tomer Maimon Date: Thu, 29 Sep 2022 16:31:11 +0300 Subject: [PATCH 0157/1423] hwrng: npcm - Add NPCM8XX support Adding RNG NPCM8XX support to NPCM RNG driver. RNG NPCM8XX uses a different clock prescaler. As part of adding NPCM8XX support: - Add NPCM8XX specific compatible string. - Add data to handle architecture specific clock prescaler. Signed-off-by: Tomer Maimon Signed-off-by: Herbert Xu --- drivers/char/hw_random/npcm-rng.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/char/hw_random/npcm-rng.c b/drivers/char/hw_random/npcm-rng.c index 1ec5f267a6566..5bf7f370f9859 100644 --- a/drivers/char/hw_random/npcm-rng.c +++ b/drivers/char/hw_random/npcm-rng.c @@ -13,11 +13,13 @@ #include #include #include +#include #define NPCM_RNGCS_REG 0x00 /* Control and status register */ #define NPCM_RNGD_REG 0x04 /* Data register */ #define NPCM_RNGMODE_REG 0x08 /* Mode register */ +#define NPCM_RNG_CLK_SET_62_5MHZ BIT(2) /* 60-80 MHz */ #define NPCM_RNG_CLK_SET_25MHZ GENMASK(4, 3) /* 20-25 MHz */ #define NPCM_RNG_DATA_VALID BIT(1) #define NPCM_RNG_ENABLE BIT(0) @@ -31,14 +33,14 @@ struct npcm_rng { void __iomem *base; struct hwrng rng; + u32 clkp; }; static int npcm_rng_init(struct hwrng *rng) { struct npcm_rng *priv = to_npcm_rng(rng); - writel(NPCM_RNG_CLK_SET_25MHZ | NPCM_RNG_ENABLE, - priv->base + NPCM_RNGCS_REG); + writel(priv->clkp | NPCM_RNG_ENABLE, priv->base + NPCM_RNGCS_REG); return 0; } @@ -47,7 +49,7 @@ static void npcm_rng_cleanup(struct hwrng *rng) { struct npcm_rng *priv = to_npcm_rng(rng); - writel(NPCM_RNG_CLK_SET_25MHZ, priv->base + NPCM_RNGCS_REG); + writel(priv->clkp, priv->base + NPCM_RNGCS_REG); } static int npcm_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait) @@ -110,6 +112,7 @@ static int npcm_rng_probe(struct platform_device *pdev) priv->rng.read = npcm_rng_read; priv->rng.priv = (unsigned long)&pdev->dev; priv->rng.quality = 1000; + priv->clkp = (u32)(uintptr_t)of_device_get_match_data(&pdev->dev); writel(NPCM_RNG_M1ROSEL, priv->base + NPCM_RNGMODE_REG); @@ -162,7 +165,10 @@ static const struct dev_pm_ops npcm_rng_pm_ops = { }; static const struct of_device_id rng_dt_id[] __maybe_unused = { - { .compatible = "nuvoton,npcm750-rng", }, + { .compatible = "nuvoton,npcm750-rng", + .data = (void *)NPCM_RNG_CLK_SET_25MHZ }, + { .compatible = "nuvoton,npcm845-rng", + .data = (void *)NPCM_RNG_CLK_SET_62_5MHZ }, {}, }; MODULE_DEVICE_TABLE(of, rng_dt_id); -- GitLab From 46beeade05c6a7673873ba0a7b6396cd3a3b3473 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 30 Sep 2022 14:09:34 +0800 Subject: [PATCH 0158/1423] crypto: ixp4xx - Fix sparse warnings This fixes a number of trivial sparse warnings in ixp4xx. Signed-off-by: Herbert Xu Acked-by: Corentin Labbe Tested-by: Corentin Labbe Acked-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/ixp4xx_crypto.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index d39a386b31ac7..984b3cc0237ca 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -420,7 +420,7 @@ static void one_packet(dma_addr_t phys) break; case CTL_FLAG_GEN_REVAES: ctx = crypto_tfm_ctx(crypt->data.tfm); - *(u32 *)ctx->decrypt.npe_ctx &= cpu_to_be32(~CIPH_ENCR); + *(__be32 *)ctx->decrypt.npe_ctx &= cpu_to_be32(~CIPH_ENCR); if (atomic_dec_and_test(&ctx->configuring)) complete(&ctx->completion); break; @@ -720,7 +720,7 @@ static int register_chain_var(struct crypto_tfm *tfm, u8 xpad, u32 target, crypt->init_len = init_len; crypt->ctl_flags |= CTL_FLAG_GEN_ICV; - buf->next = 0; + buf->next = NULL; buf->buf_len = HMAC_PAD_BLOCKLEN; buf->pkt_len = 0; buf->phys_addr = pad_phys; @@ -751,7 +751,7 @@ static int setup_auth(struct crypto_tfm *tfm, int encrypt, unsigned int authsize #ifndef __ARMEB__ cfgword ^= 0xAA000000; /* change the "byte swap" flags */ #endif - *(u32 *)cinfo = cpu_to_be32(cfgword); + *(__be32 *)cinfo = cpu_to_be32(cfgword); cinfo += sizeof(cfgword); /* write ICV to cryptinfo */ @@ -788,7 +788,7 @@ static int gen_rev_aes_key(struct crypto_tfm *tfm) if (!crypt) return -EAGAIN; - *(u32 *)dir->npe_ctx |= cpu_to_be32(CIPH_ENCR); + *(__be32 *)dir->npe_ctx |= cpu_to_be32(CIPH_ENCR); crypt->data.tfm = tfm; crypt->crypt_offs = 0; @@ -846,7 +846,7 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt, const u8 *key, return err; } /* write cfg word to cryptinfo */ - *(u32 *)cinfo = cpu_to_be32(cipher_cfg); + *(__be32 *)cinfo = cpu_to_be32(cipher_cfg); cinfo += sizeof(cipher_cfg); /* write cipher key to cryptinfo */ -- GitLab From 65c92cbb3f2365627a10cf97560d51e88fb4e588 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Fri, 30 Sep 2022 16:40:14 -0500 Subject: [PATCH 0159/1423] crypto: tcrypt - fix return value for multiple subtests When a test mode invokes multiple tests (e.g., mode 0 invokes modes 1 through 199, and mode 3 tests three block cipher modes with des), don't keep accumulating the return values with ret += tcrypt_test(), which results in a bogus value if more than one report a nonzero value (e.g., two reporting -2 (-ENOENT) end up reporting -4 (-EINTR)). Instead, keep track of the minimum return value reported by any subtest. Fixes: 4e033a6bc70f ("crypto: tcrypt - Do not exit on success in fips mode") Signed-off-by: Robert Elliott Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 256 ++++++++++++++++++++++++------------------------ 1 file changed, 128 insertions(+), 128 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index a82679b576bb4..3f7dc94a63e0b 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1471,387 +1471,387 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) } for (i = 1; i < 200; i++) - ret += do_test(NULL, 0, 0, i, num_mb); + ret = min(ret, do_test(NULL, 0, 0, i, num_mb)); break; case 1: - ret += tcrypt_test("md5"); + ret = min(ret, tcrypt_test("md5")); break; case 2: - ret += tcrypt_test("sha1"); + ret = min(ret, tcrypt_test("sha1")); break; case 3: - ret += tcrypt_test("ecb(des)"); - ret += tcrypt_test("cbc(des)"); - ret += tcrypt_test("ctr(des)"); + ret = min(ret, tcrypt_test("ecb(des)")); + ret = min(ret, tcrypt_test("cbc(des)")); + ret = min(ret, tcrypt_test("ctr(des)")); break; case 4: - ret += tcrypt_test("ecb(des3_ede)"); - ret += tcrypt_test("cbc(des3_ede)"); - ret += tcrypt_test("ctr(des3_ede)"); + ret = min(ret, tcrypt_test("ecb(des3_ede)")); + ret = min(ret, tcrypt_test("cbc(des3_ede)")); + ret = min(ret, tcrypt_test("ctr(des3_ede)")); break; case 5: - ret += tcrypt_test("md4"); + ret = min(ret, tcrypt_test("md4")); break; case 6: - ret += tcrypt_test("sha256"); + ret = min(ret, tcrypt_test("sha256")); break; case 7: - ret += tcrypt_test("ecb(blowfish)"); - ret += tcrypt_test("cbc(blowfish)"); - ret += tcrypt_test("ctr(blowfish)"); + ret = min(ret, tcrypt_test("ecb(blowfish)")); + ret = min(ret, tcrypt_test("cbc(blowfish)")); + ret = min(ret, tcrypt_test("ctr(blowfish)")); break; case 8: - ret += tcrypt_test("ecb(twofish)"); - ret += tcrypt_test("cbc(twofish)"); - ret += tcrypt_test("ctr(twofish)"); - ret += tcrypt_test("lrw(twofish)"); - ret += tcrypt_test("xts(twofish)"); + ret = min(ret, tcrypt_test("ecb(twofish)")); + ret = min(ret, tcrypt_test("cbc(twofish)")); + ret = min(ret, tcrypt_test("ctr(twofish)")); + ret = min(ret, tcrypt_test("lrw(twofish)")); + ret = min(ret, tcrypt_test("xts(twofish)")); break; case 9: - ret += tcrypt_test("ecb(serpent)"); - ret += tcrypt_test("cbc(serpent)"); - ret += tcrypt_test("ctr(serpent)"); - ret += tcrypt_test("lrw(serpent)"); - ret += tcrypt_test("xts(serpent)"); + ret = min(ret, tcrypt_test("ecb(serpent)")); + ret = min(ret, tcrypt_test("cbc(serpent)")); + ret = min(ret, tcrypt_test("ctr(serpent)")); + ret = min(ret, tcrypt_test("lrw(serpent)")); + ret = min(ret, tcrypt_test("xts(serpent)")); break; case 10: - ret += tcrypt_test("ecb(aes)"); - ret += tcrypt_test("cbc(aes)"); - ret += tcrypt_test("lrw(aes)"); - ret += tcrypt_test("xts(aes)"); - ret += tcrypt_test("ctr(aes)"); - ret += tcrypt_test("rfc3686(ctr(aes))"); - ret += tcrypt_test("ofb(aes)"); - ret += tcrypt_test("cfb(aes)"); - ret += tcrypt_test("xctr(aes)"); + ret = min(ret, tcrypt_test("ecb(aes)")); + ret = min(ret, tcrypt_test("cbc(aes)")); + ret = min(ret, tcrypt_test("lrw(aes)")); + ret = min(ret, tcrypt_test("xts(aes)")); + ret = min(ret, tcrypt_test("ctr(aes)")); + ret = min(ret, tcrypt_test("rfc3686(ctr(aes))")); + ret = min(ret, tcrypt_test("ofb(aes)")); + ret = min(ret, tcrypt_test("cfb(aes)")); + ret = min(ret, tcrypt_test("xctr(aes)")); break; case 11: - ret += tcrypt_test("sha384"); + ret = min(ret, tcrypt_test("sha384")); break; case 12: - ret += tcrypt_test("sha512"); + ret = min(ret, tcrypt_test("sha512")); break; case 13: - ret += tcrypt_test("deflate"); + ret = min(ret, tcrypt_test("deflate")); break; case 14: - ret += tcrypt_test("ecb(cast5)"); - ret += tcrypt_test("cbc(cast5)"); - ret += tcrypt_test("ctr(cast5)"); + ret = min(ret, tcrypt_test("ecb(cast5)")); + ret = min(ret, tcrypt_test("cbc(cast5)")); + ret = min(ret, tcrypt_test("ctr(cast5)")); break; case 15: - ret += tcrypt_test("ecb(cast6)"); - ret += tcrypt_test("cbc(cast6)"); - ret += tcrypt_test("ctr(cast6)"); - ret += tcrypt_test("lrw(cast6)"); - ret += tcrypt_test("xts(cast6)"); + ret = min(ret, tcrypt_test("ecb(cast6)")); + ret = min(ret, tcrypt_test("cbc(cast6)")); + ret = min(ret, tcrypt_test("ctr(cast6)")); + ret = min(ret, tcrypt_test("lrw(cast6)")); + ret = min(ret, tcrypt_test("xts(cast6)")); break; case 16: - ret += tcrypt_test("ecb(arc4)"); + ret = min(ret, tcrypt_test("ecb(arc4)")); break; case 17: - ret += tcrypt_test("michael_mic"); + ret = min(ret, tcrypt_test("michael_mic")); break; case 18: - ret += tcrypt_test("crc32c"); + ret = min(ret, tcrypt_test("crc32c")); break; case 19: - ret += tcrypt_test("ecb(tea)"); + ret = min(ret, tcrypt_test("ecb(tea)")); break; case 20: - ret += tcrypt_test("ecb(xtea)"); + ret = min(ret, tcrypt_test("ecb(xtea)")); break; case 21: - ret += tcrypt_test("ecb(khazad)"); + ret = min(ret, tcrypt_test("ecb(khazad)")); break; case 22: - ret += tcrypt_test("wp512"); + ret = min(ret, tcrypt_test("wp512")); break; case 23: - ret += tcrypt_test("wp384"); + ret = min(ret, tcrypt_test("wp384")); break; case 24: - ret += tcrypt_test("wp256"); + ret = min(ret, tcrypt_test("wp256")); break; case 26: - ret += tcrypt_test("ecb(anubis)"); - ret += tcrypt_test("cbc(anubis)"); + ret = min(ret, tcrypt_test("ecb(anubis)")); + ret = min(ret, tcrypt_test("cbc(anubis)")); break; case 30: - ret += tcrypt_test("ecb(xeta)"); + ret = min(ret, tcrypt_test("ecb(xeta)")); break; case 31: - ret += tcrypt_test("pcbc(fcrypt)"); + ret = min(ret, tcrypt_test("pcbc(fcrypt)")); break; case 32: - ret += tcrypt_test("ecb(camellia)"); - ret += tcrypt_test("cbc(camellia)"); - ret += tcrypt_test("ctr(camellia)"); - ret += tcrypt_test("lrw(camellia)"); - ret += tcrypt_test("xts(camellia)"); + ret = min(ret, tcrypt_test("ecb(camellia)")); + ret = min(ret, tcrypt_test("cbc(camellia)")); + ret = min(ret, tcrypt_test("ctr(camellia)")); + ret = min(ret, tcrypt_test("lrw(camellia)")); + ret = min(ret, tcrypt_test("xts(camellia)")); break; case 33: - ret += tcrypt_test("sha224"); + ret = min(ret, tcrypt_test("sha224")); break; case 35: - ret += tcrypt_test("gcm(aes)"); + ret = min(ret, tcrypt_test("gcm(aes)")); break; case 36: - ret += tcrypt_test("lzo"); + ret = min(ret, tcrypt_test("lzo")); break; case 37: - ret += tcrypt_test("ccm(aes)"); + ret = min(ret, tcrypt_test("ccm(aes)")); break; case 38: - ret += tcrypt_test("cts(cbc(aes))"); + ret = min(ret, tcrypt_test("cts(cbc(aes))")); break; case 39: - ret += tcrypt_test("xxhash64"); + ret = min(ret, tcrypt_test("xxhash64")); break; case 40: - ret += tcrypt_test("rmd160"); + ret = min(ret, tcrypt_test("rmd160")); break; case 42: - ret += tcrypt_test("blake2b-512"); + ret = min(ret, tcrypt_test("blake2b-512")); break; case 43: - ret += tcrypt_test("ecb(seed)"); + ret = min(ret, tcrypt_test("ecb(seed)")); break; case 45: - ret += tcrypt_test("rfc4309(ccm(aes))"); + ret = min(ret, tcrypt_test("rfc4309(ccm(aes))")); break; case 46: - ret += tcrypt_test("ghash"); + ret = min(ret, tcrypt_test("ghash")); break; case 47: - ret += tcrypt_test("crct10dif"); + ret = min(ret, tcrypt_test("crct10dif")); break; case 48: - ret += tcrypt_test("sha3-224"); + ret = min(ret, tcrypt_test("sha3-224")); break; case 49: - ret += tcrypt_test("sha3-256"); + ret = min(ret, tcrypt_test("sha3-256")); break; case 50: - ret += tcrypt_test("sha3-384"); + ret = min(ret, tcrypt_test("sha3-384")); break; case 51: - ret += tcrypt_test("sha3-512"); + ret = min(ret, tcrypt_test("sha3-512")); break; case 52: - ret += tcrypt_test("sm3"); + ret = min(ret, tcrypt_test("sm3")); break; case 53: - ret += tcrypt_test("streebog256"); + ret = min(ret, tcrypt_test("streebog256")); break; case 54: - ret += tcrypt_test("streebog512"); + ret = min(ret, tcrypt_test("streebog512")); break; case 55: - ret += tcrypt_test("gcm(sm4)"); + ret = min(ret, tcrypt_test("gcm(sm4)")); break; case 56: - ret += tcrypt_test("ccm(sm4)"); + ret = min(ret, tcrypt_test("ccm(sm4)")); break; case 57: - ret += tcrypt_test("polyval"); + ret = min(ret, tcrypt_test("polyval")); break; case 58: - ret += tcrypt_test("gcm(aria)"); + ret = min(ret, tcrypt_test("gcm(aria)")); break; case 100: - ret += tcrypt_test("hmac(md5)"); + ret = min(ret, tcrypt_test("hmac(md5)")); break; case 101: - ret += tcrypt_test("hmac(sha1)"); + ret = min(ret, tcrypt_test("hmac(sha1)")); break; case 102: - ret += tcrypt_test("hmac(sha256)"); + ret = min(ret, tcrypt_test("hmac(sha256)")); break; case 103: - ret += tcrypt_test("hmac(sha384)"); + ret = min(ret, tcrypt_test("hmac(sha384)")); break; case 104: - ret += tcrypt_test("hmac(sha512)"); + ret = min(ret, tcrypt_test("hmac(sha512)")); break; case 105: - ret += tcrypt_test("hmac(sha224)"); + ret = min(ret, tcrypt_test("hmac(sha224)")); break; case 106: - ret += tcrypt_test("xcbc(aes)"); + ret = min(ret, tcrypt_test("xcbc(aes)")); break; case 108: - ret += tcrypt_test("hmac(rmd160)"); + ret = min(ret, tcrypt_test("hmac(rmd160)")); break; case 109: - ret += tcrypt_test("vmac64(aes)"); + ret = min(ret, tcrypt_test("vmac64(aes)")); break; case 111: - ret += tcrypt_test("hmac(sha3-224)"); + ret = min(ret, tcrypt_test("hmac(sha3-224)")); break; case 112: - ret += tcrypt_test("hmac(sha3-256)"); + ret = min(ret, tcrypt_test("hmac(sha3-256)")); break; case 113: - ret += tcrypt_test("hmac(sha3-384)"); + ret = min(ret, tcrypt_test("hmac(sha3-384)")); break; case 114: - ret += tcrypt_test("hmac(sha3-512)"); + ret = min(ret, tcrypt_test("hmac(sha3-512)")); break; case 115: - ret += tcrypt_test("hmac(streebog256)"); + ret = min(ret, tcrypt_test("hmac(streebog256)")); break; case 116: - ret += tcrypt_test("hmac(streebog512)"); + ret = min(ret, tcrypt_test("hmac(streebog512)")); break; case 150: - ret += tcrypt_test("ansi_cprng"); + ret = min(ret, tcrypt_test("ansi_cprng")); break; case 151: - ret += tcrypt_test("rfc4106(gcm(aes))"); + ret = min(ret, tcrypt_test("rfc4106(gcm(aes))")); break; case 152: - ret += tcrypt_test("rfc4543(gcm(aes))"); + ret = min(ret, tcrypt_test("rfc4543(gcm(aes))")); break; case 153: - ret += tcrypt_test("cmac(aes)"); + ret = min(ret, tcrypt_test("cmac(aes)")); break; case 154: - ret += tcrypt_test("cmac(des3_ede)"); + ret = min(ret, tcrypt_test("cmac(des3_ede)")); break; case 155: - ret += tcrypt_test("authenc(hmac(sha1),cbc(aes))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha1),cbc(aes))")); break; case 156: - ret += tcrypt_test("authenc(hmac(md5),ecb(cipher_null))"); + ret = min(ret, tcrypt_test("authenc(hmac(md5),ecb(cipher_null))")); break; case 157: - ret += tcrypt_test("authenc(hmac(sha1),ecb(cipher_null))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha1),ecb(cipher_null))")); break; case 158: - ret += tcrypt_test("cbcmac(sm4)"); + ret = min(ret, tcrypt_test("cbcmac(sm4)")); break; case 159: - ret += tcrypt_test("cmac(sm4)"); + ret = min(ret, tcrypt_test("cmac(sm4)")); break; case 181: - ret += tcrypt_test("authenc(hmac(sha1),cbc(des))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha1),cbc(des))")); break; case 182: - ret += tcrypt_test("authenc(hmac(sha1),cbc(des3_ede))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha1),cbc(des3_ede))")); break; case 183: - ret += tcrypt_test("authenc(hmac(sha224),cbc(des))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha224),cbc(des))")); break; case 184: - ret += tcrypt_test("authenc(hmac(sha224),cbc(des3_ede))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha224),cbc(des3_ede))")); break; case 185: - ret += tcrypt_test("authenc(hmac(sha256),cbc(des))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha256),cbc(des))")); break; case 186: - ret += tcrypt_test("authenc(hmac(sha256),cbc(des3_ede))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha256),cbc(des3_ede))")); break; case 187: - ret += tcrypt_test("authenc(hmac(sha384),cbc(des))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha384),cbc(des))")); break; case 188: - ret += tcrypt_test("authenc(hmac(sha384),cbc(des3_ede))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha384),cbc(des3_ede))")); break; case 189: - ret += tcrypt_test("authenc(hmac(sha512),cbc(des))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha512),cbc(des))")); break; case 190: - ret += tcrypt_test("authenc(hmac(sha512),cbc(des3_ede))"); + ret = min(ret, tcrypt_test("authenc(hmac(sha512),cbc(des3_ede))")); break; case 191: - ret += tcrypt_test("ecb(sm4)"); - ret += tcrypt_test("cbc(sm4)"); - ret += tcrypt_test("cfb(sm4)"); - ret += tcrypt_test("ctr(sm4)"); + ret = min(ret, tcrypt_test("ecb(sm4)")); + ret = min(ret, tcrypt_test("cbc(sm4)")); + ret = min(ret, tcrypt_test("cfb(sm4)")); + ret = min(ret, tcrypt_test("ctr(sm4)")); break; case 192: - ret += tcrypt_test("ecb(aria)"); - ret += tcrypt_test("cbc(aria)"); - ret += tcrypt_test("cfb(aria)"); - ret += tcrypt_test("ctr(aria)"); + ret = min(ret, tcrypt_test("ecb(aria)")); + ret = min(ret, tcrypt_test("cbc(aria)")); + ret = min(ret, tcrypt_test("cfb(aria)")); + ret = min(ret, tcrypt_test("ctr(aria)")); break; case 200: test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, -- GitLab From 76a4e874593543a2dff91d249c95bac728df2774 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 6 Oct 2022 04:34:19 +0000 Subject: [PATCH 0160/1423] crypto: n2 - add missing hash statesize Add missing statesize to hash templates. This is mandatory otherwise no algorithms can be registered as the core requires statesize to be set. CC: stable@kernel.org # 4.3+ Reported-by: Rolf Eike Beer Tested-by: Rolf Eike Beer Fixes: 0a625fd2abaa ("crypto: n2 - Add Niagara2 crypto driver") Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/n2_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 31e24df18877f..20d0dcd50344b 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -1229,6 +1229,7 @@ struct n2_hash_tmpl { const u8 *hash_init; u8 hw_op_hashsz; u8 digest_size; + u8 statesize; u8 block_size; u8 auth_type; u8 hmac_type; @@ -1260,6 +1261,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { .hmac_type = AUTH_TYPE_HMAC_MD5, .hw_op_hashsz = MD5_DIGEST_SIZE, .digest_size = MD5_DIGEST_SIZE, + .statesize = sizeof(struct md5_state), .block_size = MD5_HMAC_BLOCK_SIZE }, { .name = "sha1", .hash_zero = sha1_zero_message_hash, @@ -1268,6 +1270,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { .hmac_type = AUTH_TYPE_HMAC_SHA1, .hw_op_hashsz = SHA1_DIGEST_SIZE, .digest_size = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_state), .block_size = SHA1_BLOCK_SIZE }, { .name = "sha256", .hash_zero = sha256_zero_message_hash, @@ -1276,6 +1279,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { .hmac_type = AUTH_TYPE_HMAC_SHA256, .hw_op_hashsz = SHA256_DIGEST_SIZE, .digest_size = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_state), .block_size = SHA256_BLOCK_SIZE }, { .name = "sha224", .hash_zero = sha224_zero_message_hash, @@ -1284,6 +1288,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { .hmac_type = AUTH_TYPE_RESERVED, .hw_op_hashsz = SHA256_DIGEST_SIZE, .digest_size = SHA224_DIGEST_SIZE, + .statesize = sizeof(struct sha256_state), .block_size = SHA224_BLOCK_SIZE }, }; #define NUM_HASH_TMPLS ARRAY_SIZE(hash_tmpls) @@ -1424,6 +1429,7 @@ static int __n2_register_one_ahash(const struct n2_hash_tmpl *tmpl) halg = &ahash->halg; halg->digestsize = tmpl->digest_size; + halg->statesize = tmpl->statesize; base = &halg->base; snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "%s", tmpl->name); -- GitLab From f1da27b7c4191f78ed81d3dabf64c769f896296c Mon Sep 17 00:00:00 2001 From: "Mingming.Su" Date: Sat, 8 Oct 2022 18:45:53 +0200 Subject: [PATCH 0161/1423] hwrng: mtk - add mt7986 support 1. Add trng compatible name for MT7986 2. Fix mtk_rng_wait_ready() function Signed-off-by: Mingming.Su Signed-off-by: Frank Wunderlich Signed-off-by: Herbert Xu --- drivers/char/hw_random/mtk-rng.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/char/hw_random/mtk-rng.c b/drivers/char/hw_random/mtk-rng.c index 6c00ea0085553..aa993753ab120 100644 --- a/drivers/char/hw_random/mtk-rng.c +++ b/drivers/char/hw_random/mtk-rng.c @@ -22,7 +22,7 @@ #define RNG_AUTOSUSPEND_TIMEOUT 100 #define USEC_POLL 2 -#define TIMEOUT_POLL 20 +#define TIMEOUT_POLL 60 #define RNG_CTRL 0x00 #define RNG_EN BIT(0) @@ -77,7 +77,7 @@ static bool mtk_rng_wait_ready(struct hwrng *rng, bool wait) readl_poll_timeout_atomic(priv->base + RNG_CTRL, ready, ready & RNG_READY, USEC_POLL, TIMEOUT_POLL); - return !!ready; + return !!(ready & RNG_READY); } static int mtk_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait) @@ -179,6 +179,7 @@ static const struct dev_pm_ops mtk_rng_pm_ops = { #endif /* CONFIG_PM */ static const struct of_device_id mtk_rng_match[] = { + { .compatible = "mediatek,mt7986-rng" }, { .compatible = "mediatek,mt7623-rng" }, {}, }; -- GitLab From 854e25a6d653b76007c142b7edbaba81a8789a7f Mon Sep 17 00:00:00 2001 From: jianchunfu Date: Sun, 9 Oct 2022 17:52:54 +0800 Subject: [PATCH 0162/1423] crypto: talitos - Use the defined variable to clean code Use the defined variable "dev" to make the code cleaner. Signed-off-by: jianchunfu Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index c9ad6c2130901..71db6450b6aa4 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1999,7 +1999,7 @@ static int ahash_process_req(struct ahash_request *areq, unsigned int nbytes) /* Buffer up to one whole block */ nents = sg_nents_for_len(areq->src, nbytes); if (nents < 0) { - dev_err(ctx->dev, "Invalid number of src SG.\n"); + dev_err(dev, "Invalid number of src SG.\n"); return nents; } sg_copy_to_buffer(areq->src, nents, @@ -2040,7 +2040,7 @@ static int ahash_process_req(struct ahash_request *areq, unsigned int nbytes) offset = nbytes_to_hash - req_ctx->nbuf; nents = sg_nents_for_len(areq->src, offset); if (nents < 0) { - dev_err(ctx->dev, "Invalid number of src SG.\n"); + dev_err(dev, "Invalid number of src SG.\n"); return nents; } sg_copy_to_buffer(areq->src, nents, @@ -2054,7 +2054,7 @@ static int ahash_process_req(struct ahash_request *areq, unsigned int nbytes) if (to_hash_later) { nents = sg_nents_for_len(areq->src, nbytes); if (nents < 0) { - dev_err(ctx->dev, "Invalid number of src SG.\n"); + dev_err(dev, "Invalid number of src SG.\n"); return nents; } sg_pcopy_to_buffer(areq->src, nents, -- GitLab From 7e11a4fc84dcc9746936c46d9a88489a365fea45 Mon Sep 17 00:00:00 2001 From: Tomas Marek Date: Wed, 12 Oct 2022 18:09:23 +0200 Subject: [PATCH 0163/1423] hwrng: stm32 - fix number of returned bytes on read The stm32_rng_read() function uses `retval` variable as a counter of generated random bytes. However, the same variable is used to store a result of the polling function in case the driver is waiting until the TRNG is ready. The TRNG generates random numbers by 16B. One loop read 4B. So, the function calls the polling every 16B, i.e. every 4th loop. The `retval` counter is reset on poll call and only number of bytes read after the last poll call is returned to the caller. The remaining sampled random bytes (for example 48 out of 64 in case 64 bytes are read) are not used. Use different variable to store the polling function result and do not overwrite `retval` counter. Cc: Oleg Karfich Signed-off-by: Tomas Marek Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index bc22178f83e83..8eaacefd498bb 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -49,11 +49,13 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) /* Manage timeout which is based on timer and take */ /* care of initial delay time when enabling rng */ if (!sr && wait) { - retval = readl_relaxed_poll_timeout_atomic(priv->base + int ret; + + ret = readl_relaxed_poll_timeout_atomic(priv->base + RNG_SR, sr, sr, 10, 50000); - if (retval) + if (ret) dev_err((struct device *)priv->rng.priv, "%s: timeout %x!\n", __func__, sr); } -- GitLab From e64f57e8cd5abe167cdf453869d6274608480519 Mon Sep 17 00:00:00 2001 From: Tomas Marek Date: Wed, 12 Oct 2022 18:09:24 +0200 Subject: [PATCH 0164/1423] hwrng: stm32 - fix read of the last word The stm32_rng_read() function samples TRNG by 4 bytes until at least 5 bytes are free in the input buffer. The last four bytes are never read. For example, 60 bytes are returned in case the input buffer size is 64 bytes. Read until at least 4 bytes are free in the input buffer. Fill the buffer entirely in case the buffer size is divisible by 4. Cc: Oleg Karfich Signed-off-by: Tomas Marek Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 8eaacefd498bb..366edda4848b6 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -44,7 +44,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) pm_runtime_get_sync((struct device *) priv->rng.priv); - while (max > sizeof(u32)) { + while (max >= sizeof(u32)) { sr = readl_relaxed(priv->base + RNG_SR); /* Manage timeout which is based on timer and take */ /* care of initial delay time when enabling rng */ -- GitLab From d0b9f28f0da2808b339bb290b539518e69e48ac9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Oct 2022 18:26:11 +0100 Subject: [PATCH 0165/1423] RDMA/qib: Remove not-used variable n The variable n being incremented but it is never referenced, it is redundant and can be removed. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20221021172611.26763-1-colin.i.king@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_tx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c index 6a8148851f21d..1325110237cd9 100644 --- a/drivers/infiniband/hw/qib/qib_tx.c +++ b/drivers/infiniband/hw/qib/qib_tx.c @@ -82,7 +82,6 @@ int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) struct qib_devdata *dd = rcd->dd; unsigned i; unsigned last; - unsigned n = 0; last = rcd->pio_base + rcd->piocnt; /* @@ -102,10 +101,8 @@ int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) } spin_lock_irq(&dd->pioavail_lock); for (i = rcd->pio_base; i < last; i++) { - if (__test_and_clear_bit(i, dd->pio_need_disarm)) { - n++; + if (__test_and_clear_bit(i, dd->pio_need_disarm)) dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i)); - } } spin_unlock_irq(&dd->pioavail_lock); return 0; -- GitLab From 5dc1b37d75e7136588480712e7173169b3d4164f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Oct 2022 18:35:04 +0100 Subject: [PATCH 0166/1423] RDMA/qib: Remove not-used variable freeze_cnt The variable freeze_cnt being incremented but it is never referenced, it is redundant and can be removed. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20221021173504.27546-1-colin.i.king@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_iba6120.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index aea571943768b..23a81edf3f7aa 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -799,12 +799,9 @@ static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg, hwerrs &= ~TXE_PIO_PARITY; } - if (!hwerrs) { - static u32 freeze_cnt; - - freeze_cnt++; + if (!hwerrs) qib_6120_clear_freeze(dd); - } else + else isfatal = 1; } -- GitLab From 2d5206c4629dfe74a51bb9c54758095139f5d01d Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 13:59:05 +0800 Subject: [PATCH 0167/1423] RDMA/qib: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022055905.49176-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index bf2f30d67949d..9fe03d6ffac1a 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -851,7 +851,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, } /* - * This assignment is a bit strange. it's because the + * This assignment is a bit strange. it's because * the pbc counts the number of 32 bit words in the full * packet _except_ the first word of the pbc itself... */ -- GitLab From c4bb733234b0ffd939030bb592b691ac19519455 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 14:00:30 +0800 Subject: [PATCH 0168/1423] RDMA/core: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022060030.50900-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 4084d05a45102..2e91d88793265 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1422,7 +1422,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower - * device is vlan device, consider vlan id of the + * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, -- GitLab From 65bf03427cee48258a227431577dc56bf70461f3 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 13:52:57 +0800 Subject: [PATCH 0169/1423] RDMA/qedr: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022055257.42905-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qedr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 5152f10d2e6de..5e7069b76d465 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -472,7 +472,7 @@ static irqreturn_t qedr_irq_handler(int irq, void *handle) /* The CQ's CNQ notification counter is checked before * destroying the CQ in a busy-wait loop that waits for all of * the CQ's CNQ interrupts to be processed. It is increased - * here, only after the completion handler, to ensure that the + * here, only after the completion handler, to ensure that * the handler is not running when the CQ is destroyed. */ cq->cnq_notif++; -- GitLab From 71d236399160ad9beaae7267b93d2d487e8f19a0 Mon Sep 17 00:00:00 2001 From: "yangx.jy@fujitsu.com" Date: Fri, 21 Oct 2022 13:45:17 +0000 Subject: [PATCH 0170/1423] RDMA/rxe: Remove the member 'type' of struct rxe_mr The member 'type' is included in both struct rxe_mr and struct ib_mr so remove the duplicate one of struct rxe_mr. Signed-off-by: Xiao Yang Link: https://lore.kernel.org/r/20221021134513.17730-1-yangx.jy@fujitsu.com Reviewed-by: Bob Pearson Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_mr.c | 16 ++++++++-------- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 502e9ada99b30..d4f10c2d1aa79 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -26,7 +26,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { - switch (mr->type) { + switch (mr->ibmr.type) { case IB_MR_TYPE_DMA: return 0; @@ -39,7 +39,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) default: pr_warn("%s: mr type (%d) not supported\n", - __func__, mr->type); + __func__, mr->ibmr.type); return -EFAULT; } } @@ -109,7 +109,7 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->access = access; mr->state = RXE_MR_STATE_VALID; - mr->type = IB_MR_TYPE_DMA; + mr->ibmr.type = IB_MR_TYPE_DMA; } int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -178,7 +178,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, mr->access = access; mr->offset = ib_umem_offset(umem); mr->state = RXE_MR_STATE_VALID; - mr->type = IB_MR_TYPE_USER; + mr->ibmr.type = IB_MR_TYPE_USER; return 0; @@ -205,7 +205,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) mr->max_buf = max_pages; mr->state = RXE_MR_STATE_FREE; - mr->type = IB_MR_TYPE_MEM_REG; + mr->ibmr.type = IB_MR_TYPE_MEM_REG; return 0; @@ -304,7 +304,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, if (length == 0) return 0; - if (mr->type == IB_MR_TYPE_DMA) { + if (mr->ibmr.type == IB_MR_TYPE_DMA) { u8 *src, *dest; src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova); @@ -547,8 +547,8 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) goto err_drop_ref; } - if (unlikely(mr->type != IB_MR_TYPE_MEM_REG)) { - pr_warn("%s: mr->type (%d) is wrong type\n", __func__, mr->type); + if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { + pr_warn("%s: mr type (%d) is wrong\n", __func__, mr->ibmr.type); ret = -EINVAL; goto err_drop_ref; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 5f5cbfcb35695..22a299b0a9f0a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -304,7 +304,6 @@ struct rxe_mr { u32 lkey; u32 rkey; enum rxe_mr_state state; - enum ib_mr_type type; u32 offset; int access; -- GitLab From a1ccd3d911382f68753033c6adcf69663c2a9fc5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 19 Oct 2022 15:41:25 -0500 Subject: [PATCH 0171/1423] PCI/portdrv: Squash into portdrv.c Squash portdrv_core.c and portdrv_pci.c into portdrv.c to make it easier to find things. The whole thing is less than 1000 lines, and it's a pain to bounce back and forth between two files. Several portdrv_core.c functions were non-static because they were referenced from portdrv_pci.c. Make them static since they're now all in portdrv.c. No functional change intended. Link: https://lore.kernel.org/r/20221019204127.44463-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/pci/pcie/Makefile | 2 +- .../pci/pcie/{portdrv_core.c => portdrv.c} | 252 +++++++++++++++++- drivers/pci/pcie/portdrv.h | 10 - drivers/pci/pcie/portdrv_pci.c | 252 ------------------ 4 files changed, 244 insertions(+), 272 deletions(-) rename drivers/pci/pcie/{portdrv_core.c => portdrv.c} (70%) delete mode 100644 drivers/pci/pcie/portdrv_pci.c diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 5783a2f79e6a2..8de4ed5f98f14 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,7 +2,7 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o rcec.o +pcieportdrv-y := portdrv.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv.c similarity index 70% rename from drivers/pci/pcie/portdrv_core.c rename to drivers/pci/pcie/portdrv.c index 1ac7fec47d6fb..0b4a1f9c2a6b8 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Purpose: PCI Express Port Bus Driver's Core Functions + * Purpose: PCI Express Port Bus Driver * * Copyright (C) 2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ +#include +#include #include #include #include @@ -308,7 +310,7 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq) * Allocate the port extension structure and register services associated with * the port. */ -int pcie_port_device_register(struct pci_dev *dev) +static int pcie_port_device_register(struct pci_dev *dev) { int status, capabilities, i, nr_service; int irqs[PCIE_PORT_DEVICE_MAXSERVICES]; @@ -362,7 +364,7 @@ error_disable: typedef int (*pcie_callback_t)(struct pcie_device *); -int pcie_port_device_iter(struct device *dev, void *data) +static int pcie_port_device_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; size_t offset = *(size_t *)data; @@ -382,13 +384,13 @@ int pcie_port_device_iter(struct device *dev, void *data) * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle */ -int pcie_port_device_suspend(struct device *dev) +static int pcie_port_device_suspend(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, suspend); return device_for_each_child(dev, &off, pcie_port_device_iter); } -int pcie_port_device_resume_noirq(struct device *dev) +static int pcie_port_device_resume_noirq(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume_noirq); return device_for_each_child(dev, &off, pcie_port_device_iter); @@ -398,7 +400,7 @@ int pcie_port_device_resume_noirq(struct device *dev) * pcie_port_device_resume - resume port services associated with a PCIe port * @dev: PCI Express port to handle */ -int pcie_port_device_resume(struct device *dev) +static int pcie_port_device_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume); return device_for_each_child(dev, &off, pcie_port_device_iter); @@ -408,7 +410,7 @@ int pcie_port_device_resume(struct device *dev) * pcie_port_device_runtime_suspend - runtime suspend port services * @dev: PCI Express port to handle */ -int pcie_port_device_runtime_suspend(struct device *dev) +static int pcie_port_device_runtime_suspend(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_suspend); return device_for_each_child(dev, &off, pcie_port_device_iter); @@ -418,7 +420,7 @@ int pcie_port_device_runtime_suspend(struct device *dev) * pcie_port_device_runtime_resume - runtime resume port services * @dev: PCI Express port to handle */ -int pcie_port_device_runtime_resume(struct device *dev) +static int pcie_port_device_runtime_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_resume); return device_for_each_child(dev, &off, pcie_port_device_iter); @@ -482,7 +484,7 @@ EXPORT_SYMBOL_GPL(pcie_port_find_device); * Remove PCI Express port service devices associated with given port and * disable MSI-X or MSI for the port. */ -void pcie_port_device_remove(struct pci_dev *dev) +static void pcie_port_device_remove(struct pci_dev *dev) { device_for_each_child(&dev->dev, NULL, remove_iter); pci_free_irq_vectors(dev); @@ -584,3 +586,235 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *drv) driver_unregister(&drv->driver); } EXPORT_SYMBOL(pcie_port_service_unregister); + +/* If this switch is set, PCIe port native services should not be enabled. */ +bool pcie_ports_disabled; + +/* + * If the user specified "pcie_ports=native", use the PCIe services regardless + * of whether the platform has given us permission. On ACPI systems, this + * means we ignore _OSC. + */ +bool pcie_ports_native; + +/* + * If the user specified "pcie_ports=dpc-native", use the Linux DPC PCIe + * service even if the platform hasn't given us permission. + */ +bool pcie_ports_dpc_native; + +static int __init pcie_port_setup(char *str) +{ + if (!strncmp(str, "compat", 6)) + pcie_ports_disabled = true; + else if (!strncmp(str, "native", 6)) + pcie_ports_native = true; + else if (!strncmp(str, "dpc-native", 10)) + pcie_ports_dpc_native = true; + + return 1; +} +__setup("pcie_ports=", pcie_port_setup); + +/* global data */ + +#ifdef CONFIG_PM +static int pcie_port_runtime_suspend(struct device *dev) +{ + if (!to_pci_dev(dev)->bridge_d3) + return -EBUSY; + + return pcie_port_device_runtime_suspend(dev); +} + +static int pcie_port_runtime_idle(struct device *dev) +{ + /* + * Assume the PCI core has set bridge_d3 whenever it thinks the port + * should be good to go to D3. Everything else, including moving + * the port to D3, is handled by the PCI core. + */ + return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY; +} + +static const struct dev_pm_ops pcie_portdrv_pm_ops = { + .suspend = pcie_port_device_suspend, + .resume_noirq = pcie_port_device_resume_noirq, + .resume = pcie_port_device_resume, + .freeze = pcie_port_device_suspend, + .thaw = pcie_port_device_resume, + .poweroff = pcie_port_device_suspend, + .restore_noirq = pcie_port_device_resume_noirq, + .restore = pcie_port_device_resume, + .runtime_suspend = pcie_port_runtime_suspend, + .runtime_resume = pcie_port_device_runtime_resume, + .runtime_idle = pcie_port_runtime_idle, +}; + +#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) + +#else /* !PM */ + +#define PCIE_PORTDRV_PM_OPS NULL +#endif /* !PM */ + +/* + * pcie_portdrv_probe - Probe PCI-Express port devices + * @dev: PCI-Express port device being probed + * + * If detected invokes the pcie_port_device_register() method for + * this port device. + * + */ +static int pcie_portdrv_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + int type = pci_pcie_type(dev); + int status; + + if (!pci_is_pcie(dev) || + ((type != PCI_EXP_TYPE_ROOT_PORT) && + (type != PCI_EXP_TYPE_UPSTREAM) && + (type != PCI_EXP_TYPE_DOWNSTREAM) && + (type != PCI_EXP_TYPE_RC_EC))) + return -ENODEV; + + if (type == PCI_EXP_TYPE_RC_EC) + pcie_link_rcec(dev); + + status = pcie_port_device_register(dev); + if (status) + return status; + + pci_save_state(dev); + + dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_NO_DIRECT_COMPLETE | + DPM_FLAG_SMART_SUSPEND); + + if (pci_bridge_d3_possible(dev)) { + /* + * Keep the port resumed 100ms to make sure things like + * config space accesses from userspace (lspci) will not + * cause the port to repeatedly suspend and resume. + */ + pm_runtime_set_autosuspend_delay(&dev->dev, 100); + pm_runtime_use_autosuspend(&dev->dev); + pm_runtime_mark_last_busy(&dev->dev); + pm_runtime_put_autosuspend(&dev->dev); + pm_runtime_allow(&dev->dev); + } + + return 0; +} + +static void pcie_portdrv_remove(struct pci_dev *dev) +{ + if (pci_bridge_d3_possible(dev)) { + pm_runtime_forbid(&dev->dev); + pm_runtime_get_noresume(&dev->dev); + pm_runtime_dont_use_autosuspend(&dev->dev); + } + + pcie_port_device_remove(dev); +} + +static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev, + pci_channel_state_t error) +{ + if (error == pci_channel_io_frozen) + return PCI_ERS_RESULT_NEED_RESET; + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) +{ + size_t off = offsetof(struct pcie_port_service_driver, slot_reset); + device_for_each_child(&dev->dev, &off, pcie_port_device_iter); + + pci_restore_state(dev); + pci_save_state(dev); + return PCI_ERS_RESULT_RECOVERED; +} + +static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) +{ + return PCI_ERS_RESULT_RECOVERED; +} + +/* + * LINUX Device Driver Model + */ +static const struct pci_device_id port_pci_ids[] = { + /* handle any PCI-Express port */ + { PCI_DEVICE_CLASS(PCI_CLASS_BRIDGE_PCI_NORMAL, ~0) }, + /* subtractive decode PCI-to-PCI bridge, class type is 060401h */ + { PCI_DEVICE_CLASS(PCI_CLASS_BRIDGE_PCI_SUBTRACTIVE, ~0) }, + /* handle any Root Complex Event Collector */ + { PCI_DEVICE_CLASS(((PCI_CLASS_SYSTEM_RCEC << 8) | 0x00), ~0) }, + { }, +}; + +static const struct pci_error_handlers pcie_portdrv_err_handler = { + .error_detected = pcie_portdrv_error_detected, + .slot_reset = pcie_portdrv_slot_reset, + .mmio_enabled = pcie_portdrv_mmio_enabled, +}; + +static struct pci_driver pcie_portdriver = { + .name = "pcieport", + .id_table = &port_pci_ids[0], + + .probe = pcie_portdrv_probe, + .remove = pcie_portdrv_remove, + .shutdown = pcie_portdrv_remove, + + .err_handler = &pcie_portdrv_err_handler, + + .driver_managed_dma = true, + + .driver.pm = PCIE_PORTDRV_PM_OPS, +}; + +static int __init dmi_pcie_pme_disable_msi(const struct dmi_system_id *d) +{ + pr_notice("%s detected: will not use MSI for PCIe PME signaling\n", + d->ident); + pcie_pme_disable_msi(); + return 0; +} + +static const struct dmi_system_id pcie_portdrv_dmi_table[] __initconst = { + /* + * Boxes that should not use MSI for PCIe PME signaling. + */ + { + .callback = dmi_pcie_pme_disable_msi, + .ident = "MSI Wind U-100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, + "MICRO-STAR INTERNATIONAL CO., LTD"), + DMI_MATCH(DMI_PRODUCT_NAME, "U-100"), + }, + }, + {} +}; + +static void __init pcie_init_services(void) +{ + pcie_aer_init(); + pcie_pme_init(); + pcie_dpc_init(); + pcie_hp_init(); +} + +static int __init pcie_portdrv_init(void) +{ + if (pcie_ports_disabled) + return -EACCES; + + pcie_init_services(); + dmi_check_system(pcie_portdrv_dmi_table); + + return pci_register_driver(&pcie_portdriver); +} +device_initcall(pcie_portdrv_init); diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 0ef4bf5f811d9..bf380bcea6a5b 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -108,16 +108,6 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *new); #define get_descriptor_id(type, service) (((type - 4) << 8) | service) extern struct bus_type pcie_port_bus_type; -int pcie_port_device_register(struct pci_dev *dev); -int pcie_port_device_iter(struct device *dev, void *data); -#ifdef CONFIG_PM -int pcie_port_device_suspend(struct device *dev); -int pcie_port_device_resume_noirq(struct device *dev); -int pcie_port_device_resume(struct device *dev); -int pcie_port_device_runtime_suspend(struct device *dev); -int pcie_port_device_runtime_resume(struct device *dev); -#endif -void pcie_port_device_remove(struct pci_dev *dev); struct pci_dev; diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c deleted file mode 100644 index 7f8788a970ae1..0000000000000 --- a/drivers/pci/pcie/portdrv_pci.c +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Purpose: PCI Express Port Bus Driver - * Author: Tom Nguyen - * - * Copyright (C) 2004 Intel - * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pci.h" -#include "portdrv.h" - -/* If this switch is set, PCIe port native services should not be enabled. */ -bool pcie_ports_disabled; - -/* - * If the user specified "pcie_ports=native", use the PCIe services regardless - * of whether the platform has given us permission. On ACPI systems, this - * means we ignore _OSC. - */ -bool pcie_ports_native; - -/* - * If the user specified "pcie_ports=dpc-native", use the Linux DPC PCIe - * service even if the platform hasn't given us permission. - */ -bool pcie_ports_dpc_native; - -static int __init pcie_port_setup(char *str) -{ - if (!strncmp(str, "compat", 6)) - pcie_ports_disabled = true; - else if (!strncmp(str, "native", 6)) - pcie_ports_native = true; - else if (!strncmp(str, "dpc-native", 10)) - pcie_ports_dpc_native = true; - - return 1; -} -__setup("pcie_ports=", pcie_port_setup); - -/* global data */ - -#ifdef CONFIG_PM -static int pcie_port_runtime_suspend(struct device *dev) -{ - if (!to_pci_dev(dev)->bridge_d3) - return -EBUSY; - - return pcie_port_device_runtime_suspend(dev); -} - -static int pcie_port_runtime_idle(struct device *dev) -{ - /* - * Assume the PCI core has set bridge_d3 whenever it thinks the port - * should be good to go to D3. Everything else, including moving - * the port to D3, is handled by the PCI core. - */ - return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY; -} - -static const struct dev_pm_ops pcie_portdrv_pm_ops = { - .suspend = pcie_port_device_suspend, - .resume_noirq = pcie_port_device_resume_noirq, - .resume = pcie_port_device_resume, - .freeze = pcie_port_device_suspend, - .thaw = pcie_port_device_resume, - .poweroff = pcie_port_device_suspend, - .restore_noirq = pcie_port_device_resume_noirq, - .restore = pcie_port_device_resume, - .runtime_suspend = pcie_port_runtime_suspend, - .runtime_resume = pcie_port_device_runtime_resume, - .runtime_idle = pcie_port_runtime_idle, -}; - -#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops) - -#else /* !PM */ - -#define PCIE_PORTDRV_PM_OPS NULL -#endif /* !PM */ - -/* - * pcie_portdrv_probe - Probe PCI-Express port devices - * @dev: PCI-Express port device being probed - * - * If detected invokes the pcie_port_device_register() method for - * this port device. - * - */ -static int pcie_portdrv_probe(struct pci_dev *dev, - const struct pci_device_id *id) -{ - int type = pci_pcie_type(dev); - int status; - - if (!pci_is_pcie(dev) || - ((type != PCI_EXP_TYPE_ROOT_PORT) && - (type != PCI_EXP_TYPE_UPSTREAM) && - (type != PCI_EXP_TYPE_DOWNSTREAM) && - (type != PCI_EXP_TYPE_RC_EC))) - return -ENODEV; - - if (type == PCI_EXP_TYPE_RC_EC) - pcie_link_rcec(dev); - - status = pcie_port_device_register(dev); - if (status) - return status; - - pci_save_state(dev); - - dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_NO_DIRECT_COMPLETE | - DPM_FLAG_SMART_SUSPEND); - - if (pci_bridge_d3_possible(dev)) { - /* - * Keep the port resumed 100ms to make sure things like - * config space accesses from userspace (lspci) will not - * cause the port to repeatedly suspend and resume. - */ - pm_runtime_set_autosuspend_delay(&dev->dev, 100); - pm_runtime_use_autosuspend(&dev->dev); - pm_runtime_mark_last_busy(&dev->dev); - pm_runtime_put_autosuspend(&dev->dev); - pm_runtime_allow(&dev->dev); - } - - return 0; -} - -static void pcie_portdrv_remove(struct pci_dev *dev) -{ - if (pci_bridge_d3_possible(dev)) { - pm_runtime_forbid(&dev->dev); - pm_runtime_get_noresume(&dev->dev); - pm_runtime_dont_use_autosuspend(&dev->dev); - } - - pcie_port_device_remove(dev); -} - -static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev, - pci_channel_state_t error) -{ - if (error == pci_channel_io_frozen) - return PCI_ERS_RESULT_NEED_RESET; - return PCI_ERS_RESULT_CAN_RECOVER; -} - -static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) -{ - size_t off = offsetof(struct pcie_port_service_driver, slot_reset); - device_for_each_child(&dev->dev, &off, pcie_port_device_iter); - - pci_restore_state(dev); - pci_save_state(dev); - return PCI_ERS_RESULT_RECOVERED; -} - -static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) -{ - return PCI_ERS_RESULT_RECOVERED; -} - -/* - * LINUX Device Driver Model - */ -static const struct pci_device_id port_pci_ids[] = { - /* handle any PCI-Express port */ - { PCI_DEVICE_CLASS(PCI_CLASS_BRIDGE_PCI_NORMAL, ~0) }, - /* subtractive decode PCI-to-PCI bridge, class type is 060401h */ - { PCI_DEVICE_CLASS(PCI_CLASS_BRIDGE_PCI_SUBTRACTIVE, ~0) }, - /* handle any Root Complex Event Collector */ - { PCI_DEVICE_CLASS(((PCI_CLASS_SYSTEM_RCEC << 8) | 0x00), ~0) }, - { }, -}; - -static const struct pci_error_handlers pcie_portdrv_err_handler = { - .error_detected = pcie_portdrv_error_detected, - .slot_reset = pcie_portdrv_slot_reset, - .mmio_enabled = pcie_portdrv_mmio_enabled, -}; - -static struct pci_driver pcie_portdriver = { - .name = "pcieport", - .id_table = &port_pci_ids[0], - - .probe = pcie_portdrv_probe, - .remove = pcie_portdrv_remove, - .shutdown = pcie_portdrv_remove, - - .err_handler = &pcie_portdrv_err_handler, - - .driver_managed_dma = true, - - .driver.pm = PCIE_PORTDRV_PM_OPS, -}; - -static int __init dmi_pcie_pme_disable_msi(const struct dmi_system_id *d) -{ - pr_notice("%s detected: will not use MSI for PCIe PME signaling\n", - d->ident); - pcie_pme_disable_msi(); - return 0; -} - -static const struct dmi_system_id pcie_portdrv_dmi_table[] __initconst = { - /* - * Boxes that should not use MSI for PCIe PME signaling. - */ - { - .callback = dmi_pcie_pme_disable_msi, - .ident = "MSI Wind U-100", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, - "MICRO-STAR INTERNATIONAL CO., LTD"), - DMI_MATCH(DMI_PRODUCT_NAME, "U-100"), - }, - }, - {} -}; - -static void __init pcie_init_services(void) -{ - pcie_aer_init(); - pcie_pme_init(); - pcie_dpc_init(); - pcie_hp_init(); -} - -static int __init pcie_portdrv_init(void) -{ - if (pcie_ports_disabled) - return -EACCES; - - pcie_init_services(); - dmi_check_system(pcie_portdrv_dmi_table); - - return pci_register_driver(&pcie_portdriver); -} -device_initcall(pcie_portdrv_init); -- GitLab From 29f193feeea3e3af1c4650870f08ca896b83e1db Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 19 Oct 2022 15:41:26 -0500 Subject: [PATCH 0172/1423] PCI/portdrv: Move private things to portdrv.c Previously several things used by portdrv_core.c and portdrv_pci.c were shared by defining them in portdrv.h. Now that portdrv_core.c and portdrv_pci.c have been squashed, move things that can be private into portdrv.c. No functional change intended. Link: https://lore.kernel.org/r/20221019204127.44463-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/pci/pcie/portdrv.c | 9 +++++++++ drivers/pci/pcie/portdrv.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index 0b4a1f9c2a6b8..ae8da5b2e922c 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -21,6 +21,15 @@ #include "../pci.h" #include "portdrv.h" +/* + * The PCIe Capability Interrupt Message Number (PCIe r3.1, sec 7.8.2) must + * be one of the first 32 MSI-X entries. Per PCI r3.0, sec 6.8.3.1, MSI + * supports a maximum of 32 vectors per function. + */ +#define PCIE_PORT_MAX_MSI_ENTRIES 32 + +#define get_descriptor_id(type, service) (((type - 4) << 8) | service) + struct portdrv_service_data { struct pcie_port_service_driver *drv; struct device *dev; diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index bf380bcea6a5b..58a2b1a1cae4c 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -98,15 +98,6 @@ struct pcie_port_service_driver { int pcie_port_service_register(struct pcie_port_service_driver *new); void pcie_port_service_unregister(struct pcie_port_service_driver *new); -/* - * The PCIe Capability Interrupt Message Number (PCIe r3.1, sec 7.8.2) must - * be one of the first 32 MSI-X entries. Per PCI r3.0, sec 6.8.3.1, MSI - * supports a maximum of 32 vectors per function. - */ -#define PCIE_PORT_MAX_MSI_ENTRIES 32 - -#define get_descriptor_id(type, service) (((type - 4) << 8) | service) - extern struct bus_type pcie_port_bus_type; struct pci_dev; -- GitLab From 461a65d7d1a4f56b97c9115eda3e8619516f40fb Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 19 Oct 2022 15:41:27 -0500 Subject: [PATCH 0173/1423] PCI/portdrv: Unexport pcie_port_service_register(), pcie_port_service_unregister() pcie_port_service_register() and pcie_port_service_unregister() are used only by the pciehp, aer, dpc, and pme PCIe port service drivers, none of which can be modules. Unexport pcie_port_service_register() and pcie_port_service_unregister(). No functional change intended. Link: https://lore.kernel.org/r/20221019204127.44463-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/pci/pcie/portdrv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index ae8da5b2e922c..a6c4225505d53 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -584,7 +584,6 @@ int pcie_port_service_register(struct pcie_port_service_driver *new) return driver_register(&new->driver); } -EXPORT_SYMBOL(pcie_port_service_register); /** * pcie_port_service_unregister - unregister PCI Express port service driver @@ -594,7 +593,6 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *drv) { driver_unregister(&drv->driver); } -EXPORT_SYMBOL(pcie_port_service_unregister); /* If this switch is set, PCIe port native services should not be enabled. */ bool pcie_ports_disabled; -- GitLab From 2f7a29debae2efef94b981377fa3622986cd57f5 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Sep 2022 10:28:39 +0800 Subject: [PATCH 0174/1423] apparmor: remove useless static inline functions Remove the following useless static inline functions: 1. label_is_visible() is a static function in security/apparmor/label.c, and it's not used, aa_ns_visible() can do the same things as it, so it's redundant. 2. is_deleted() is a static function in security/apparmor/file.c, and it's not used since commit aebd873e8d3e ("apparmor: refactor path name lookup and permission checks around labels"), so it's redundant. They are redundant, so remove them. Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/file.c | 13 ------------- security/apparmor/label.c | 6 ------ 2 files changed, 19 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index e7dc5ea389973..deb73480f0c65 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -141,19 +141,6 @@ int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, return aa_audit(type, profile, &sa, file_audit_cb); } -/** - * is_deleted - test if a file has been completely unlinked - * @dentry: dentry of file to test for deletion (NOT NULL) - * - * Returns: true if deleted else false - */ -static inline bool is_deleted(struct dentry *dentry) -{ - if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) - return true; - return false; -} - static int path_name(const char *op, struct aa_label *label, const struct path *path, int flags, char *buffer, const char **name, struct path_cond *cond, u32 request) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index aa4031628af55..8a2af96f4da57 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1256,12 +1256,6 @@ out: return label; } -static inline bool label_is_visible(struct aa_profile *profile, - struct aa_label *label) -{ - return aa_ns_visible(profile->ns, labels_ns(label), true); -} - /* match a profile and its associated ns component if needed * Assumes visibility test has already been done. * If a subns profile is not to be matched should be prescreened with -- GitLab From 1f2bc06a8dbff73957f433b22c6fd35fccfb47a4 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Sep 2022 19:48:38 +0800 Subject: [PATCH 0175/1423] apparmor: fix obsoleted comments for aa_getprocattr() and audit_resource() Update the comments for aa_getprocattr() and audit_resource(), the args of them have beed changed since commit 76a1d263aba3 ("apparmor: switch getprocattr to using label_print fns()"). Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/procattr.c | 11 +++++------ security/apparmor/resource.c | 2 ++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/security/apparmor/procattr.c b/security/apparmor/procattr.c index 86ad26ef72ed4..197d41f9c32b7 100644 --- a/security/apparmor/procattr.c +++ b/security/apparmor/procattr.c @@ -17,14 +17,13 @@ /** - * aa_getprocattr - Return the profile information for @profile - * @profile: the profile to print profile info about (NOT NULL) - * @string: Returns - string containing the profile info (NOT NULL) + * aa_getprocattr - Return the label information for @label + * @label: the label to print label info about (NOT NULL) + * @string: Returns - string containing the label info (NOT NULL) * - * Requires: profile != NULL + * Requires: label != NULL && string != NULL * - * Creates a string containing the namespace_name://profile_name for - * @profile. + * Creates a string containing the label information for @label. * * Returns: size of string placed in @string else error code on failure */ diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index ed543f4edfd9f..1b75d8343a8db 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -45,6 +45,8 @@ static void audit_cb(struct audit_buffer *ab, void *va) * @profile: profile being enforced (NOT NULL) * @resource: rlimit being auditing * @value: value being set + * @peer: aa_albel of the task being set + * @info: info being auditing * @error: error value * * Returns: 0 or sa->error else other error code on failure -- GitLab From 58f89ce58bb4f5cf5963b20a19aaa2431b0412d8 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 3 Oct 2022 02:48:24 -0700 Subject: [PATCH 0176/1423] apparmor: refactor code that alloc null profiles Bother unconfined and learning profiles use the null profile as their base. Refactor so they are share a common base routine. This doesn't save much atm but will be important when the feature set of the parent is inherited. Signed-off-by: John Johansen --- security/apparmor/domain.c | 12 ++++---- security/apparmor/include/policy.h | 6 ++-- security/apparmor/policy.c | 47 ++++++++++++++++++++---------- security/apparmor/policy_ns.c | 6 +--- 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index d4b09f061aee2..b447bc13ea8e2 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -681,8 +681,8 @@ static struct aa_label *profile_transition(struct aa_profile *profile, /* no exec permission - learning mode */ struct aa_profile *new_profile = NULL; - new_profile = aa_new_null_profile(profile, false, name, - GFP_KERNEL); + new_profile = aa_new_learning_profile(profile, false, name, + GFP_KERNEL); if (!new_profile) { error = -ENOMEM; info = "could not create null profile"; @@ -1009,8 +1009,8 @@ static struct aa_label *build_change_hat(struct aa_profile *profile, if (!hat) { error = -ENOENT; if (COMPLAIN_MODE(profile)) { - hat = aa_new_null_profile(profile, true, name, - GFP_KERNEL); + hat = aa_new_learning_profile(profile, true, name, + GFP_KERNEL); if (!hat) { info = "failed null profile create"; error = -ENOMEM; @@ -1361,8 +1361,8 @@ int aa_change_profile(const char *fqname, int flags) !COMPLAIN_MODE(labels_profile(label))) goto audit; /* released below */ - tprofile = aa_new_null_profile(labels_profile(label), false, - fqname, GFP_KERNEL); + tprofile = aa_new_learning_profile(labels_profile(label), false, + fqname, GFP_KERNEL); if (!tprofile) { info = "failed null profile create"; error = -ENOMEM; diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 5cadfb20df294..545f791cabdae 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -234,8 +234,10 @@ void aa_free_proxy_kref(struct kref *kref); struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp); struct aa_profile *aa_alloc_profile(const char *name, struct aa_proxy *proxy, gfp_t gfp); -struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, - const char *base, gfp_t gfp); +struct aa_profile *aa_alloc_null(struct aa_profile *parent, const char *name, + gfp_t gfp); +struct aa_profile *aa_new_learning_profile(struct aa_profile *parent, bool hat, + const char *base, gfp_t gfp); void aa_free_profile(struct aa_profile *profile); void aa_free_profile_kref(struct kref *kref); struct aa_profile *aa_find_child(struct aa_profile *parent, const char *name); diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 6f4cc8bfe03de..c17ccedd35f1e 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -524,8 +524,36 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, return profile; } + +struct aa_profile *aa_alloc_null(struct aa_profile *parent, const char *name, + gfp_t gfp) +{ + struct aa_profile *profile; + struct aa_ruleset *rules; + + profile = aa_alloc_profile(name, NULL, gfp); + if (!profile) + return NULL; + + /* TODO: ideally we should inherit abi from parent */ + profile->label.flags |= FLAG_NULL; + rules = list_first_entry(&profile->rules, typeof(*rules), list); + rules->file.dfa = aa_get_dfa(nulldfa); + rules->policy.dfa = aa_get_dfa(nulldfa); + + if (parent) { + profile->path_flags = parent->path_flags; + + /* released on free_profile */ + rcu_assign_pointer(profile->parent, aa_get_profile(parent)); + profile->ns = aa_get_ns(parent->ns); + } + + return profile; +} + /** - * aa_new_null_profile - create or find a null-X learning profile + * aa_new_learning_profile - create or find a null-X learning profile * @parent: profile that caused this profile to be created (NOT NULL) * @hat: true if the null- learning profile is a hat * @base: name to base the null profile off of @@ -542,10 +570,9 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, * * Returns: new refcounted profile else NULL on failure */ -struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, - const char *base, gfp_t gfp) +struct aa_profile *aa_new_learning_profile(struct aa_profile *parent, bool hat, + const char *base, gfp_t gfp) { - struct aa_ruleset *rules; struct aa_profile *p, *profile; const char *bname; char *name = NULL; @@ -575,22 +602,12 @@ name: if (profile) goto out; - profile = aa_alloc_profile(name, NULL, gfp); + profile = aa_alloc_null(parent, name, gfp); if (!profile) goto fail; - profile->mode = APPARMOR_COMPLAIN; - profile->label.flags |= FLAG_NULL; if (hat) profile->label.flags |= FLAG_HAT; - profile->path_flags = parent->path_flags; - - /* released on free_profile */ - rcu_assign_pointer(profile->parent, aa_get_profile(parent)); - profile->ns = aa_get_ns(parent->ns); - rules = list_first_entry(&profile->rules, typeof(*rules), list); - rules->file.dfa = aa_get_dfa(nulldfa); - rules->policy.dfa = aa_get_dfa(nulldfa); mutex_lock_nested(&profile->ns->lock, profile->ns->level); p = __find_child(&parent->base.profiles, bname); diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 121aa79bccaac..5c38563a6dcfd 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -83,18 +83,14 @@ const char *aa_ns_name(struct aa_ns *curr, struct aa_ns *view, bool subns) static struct aa_profile *alloc_unconfined(const char *name) { struct aa_profile *profile; - struct aa_ruleset *rules; - profile = aa_alloc_profile(name, NULL, GFP_KERNEL); + profile = aa_alloc_null(NULL, name, GFP_KERNEL); if (!profile) return NULL; profile->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; profile->mode = APPARMOR_UNCONFINED; - rules = list_first_entry(&profile->rules, typeof(*rules), list); - rules->file.dfa = aa_get_dfa(nulldfa); - rules->policy.dfa = aa_get_dfa(nulldfa); return profile; } -- GitLab From 5ebc548f4f54fe971d741d80cd108f1a45c9e88d Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 13 Oct 2022 10:47:23 +0900 Subject: [PATCH 0177/1423] RDMA/rxe: Make responder handle RDMA Read failures Currently, responder can reply packets with invalid payloads if it fails to copy messages to the packets. Add an error handling in read_reply() to inform a requesting node of the failure. Link: https://lore.kernel.org/r/20221013014724.3786212-1-matsuda-daisuke@fujitsu.com Suggested-by: Li Zhijian Signed-off-by: Daisuke Matsuda Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index ed5a09e86417e..82b74e926e09f 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -809,10 +809,14 @@ static enum resp_states read_reply(struct rxe_qp *qp, if (!skb) return RESPST_ERR_RNR; - rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), - payload, RXE_FROM_MR_OBJ); + err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), + payload, RXE_FROM_MR_OBJ); if (mr) rxe_put(mr); + if (err) { + kfree_skb(skb); + return RESPST_ERR_RKEY_VIOLATION; + } if (bth_pad(&ack_pkt)) { u8 *pad = payload_addr(&ack_pkt) + payload; -- GitLab From 5ac814e02ece516761d2e244cef93843df911ae0 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 13 Oct 2022 10:47:24 +0900 Subject: [PATCH 0178/1423] RDMA/rxe: Handle remote errors in the midst of a Read reply sequence Requesting nodes do not handle a reported error correctly if it is generated in the middle of multi-packet Read responses, and the node tries to resend the request endlessly. Let completer terminate the connection in that case. Link: https://lore.kernel.org/r/20221013014724.3786212-2-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_comp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index fb0c008af78cc..c9170dd99f3aa 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -200,6 +200,10 @@ static inline enum comp_state check_psn(struct rxe_qp *qp, */ if (pkt->psn == wqe->last_psn) return COMPST_COMP_ACK; + else if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE && + (qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST || + qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE)) + return COMPST_CHECK_ACK; else return COMPST_DONE; } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { @@ -228,6 +232,10 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + /* Check NAK code to handle a remote error */ + if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE) + break; + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { /* read retries of partial data may restart from -- GitLab From 686d348476ee8006087cfcbef591e28f4f91bd8b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 24 Oct 2022 03:31:54 +0000 Subject: [PATCH 0179/1423] RDMA/rxe: Remove unnecessary mr testing Before the testing, we already passed it to rxe_mr_copy() where mr could be dereferenced. so this checking is not needed. The only way that mr is NULL is when it reaches below line 780 with 'qp->resp.mr = NULL', which is not possible in Bob's explanation[1]. 778 if (res->state == rdatm_res_state_new) { 779 if (!res->replay) { 780 mr = qp->resp.mr; 781 qp->resp.mr = NULL; 782 } else { [1] https://lore.kernel.org/lkml/30ff25c4-ce66-eac4-eaa2-64c0db203a19@gmail.com/ Link: https://lore.kernel.org/r/1666582315-2-1-git-send-email-lizhijian@fujitsu.com CC: Bob Pearson Signed-off-by: Li Zhijian Reviewed-by: Bob Pearson Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 82b74e926e09f..95d372db934d7 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -811,8 +811,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - if (mr) - rxe_put(mr); + rxe_put(mr); if (err) { kfree_skb(skb); return RESPST_ERR_RKEY_VIOLATION; -- GitLab From 665b1856dc2399828d8ee07a18d4fd79868e729a Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 3 Oct 2022 06:06:26 -0700 Subject: [PATCH 0180/1423] apparmor: Fix loading of child before parent Unfortunately it is possible for some userspace's to load children profiles before the parent profile. This can even happen when the child and the parent are in different load sets. Fix this by creating a null place holder profile that grants no permissions and can be replaced by the parent once it is loaded. Signed-off-by: John Johansen --- security/apparmor/policy.c | 87 ++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 9 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index c17ccedd35f1e..66034cf96f4c8 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -423,6 +423,57 @@ static struct aa_policy *__lookup_parent(struct aa_ns *ns, return &profile->base; } +/** + * __create_missing_ancestors - create place holders for missing ancestores + * @ns: namespace to lookup profile in (NOT NULL) + * @hname: hierarchical profile name to find parent of (NOT NULL) + * @gfp: type of allocation. + * + * Returns: NULL on error, parent profile on success + * + * Requires: ns mutex lock held + * + * Returns: unrefcounted parent policy or NULL if error creating + * place holder profiles. + */ +static struct aa_policy *__create_missing_ancestors(struct aa_ns *ns, + const char *hname, + gfp_t gfp) +{ + struct aa_policy *policy; + struct aa_profile *parent, *profile = NULL; + char *split; + + AA_BUG(!ns); + AA_BUG(!hname); + + policy = &ns->base; + + for (split = strstr(hname, "//"); split;) { + parent = profile; + profile = __strn_find_child(&policy->profiles, hname, + split - hname); + if (!profile) { + const char *name = kstrndup(hname, split - hname, + gfp); + if (!name) + return NULL; + profile = aa_alloc_null(parent, name, gfp); + kfree(name); + if (!profile) + return NULL; + if (!parent) + profile->ns = aa_get_ns(ns); + } + policy = &profile->base; + hname = split + 2; + split = strstr(hname, "//"); + } + if (!profile) + return &ns->base; + return &profile->base; +} + /** * __lookupn_profile - lookup the profile matching @hname * @base: base list to start looking up profile name from (NOT NULL) @@ -1032,6 +1083,7 @@ ssize_t aa_replace_profiles(struct aa_ns *policy_ns, struct aa_label *label, /* setup parent and ns info */ list_for_each_entry(ent, &lh, list) { struct aa_policy *policy; + struct aa_profile *p; if (aa_g_export_binary) ent->new->rawdata = aa_get_loaddata(udata); @@ -1056,21 +1108,38 @@ ssize_t aa_replace_profiles(struct aa_ns *policy_ns, struct aa_label *label, continue; /* no ref on policy only use inside lock */ + p = NULL; policy = __lookup_parent(ns, ent->new->base.hname); if (!policy) { - struct aa_profile *p; + /* first check for parent in the load set */ p = __list_lookup_parent(&lh, ent->new); if (!p) { - error = -ENOENT; - info = "parent does not exist"; - goto fail_lock; + /* + * fill in missing parent with null + * profile that doesn't have + * permissions. This allows for + * individual profile loading where + * the child is loaded before the + * parent, and outside of the current + * atomic set. This unfortunately can + * happen with some userspaces. The + * null profile will be replaced once + * the parent is loaded. + */ + policy = __create_missing_ancestors(ns, + ent->new->base.hname, + GFP_KERNEL); + if (!policy) { + error = -ENOENT; + info = "parent does not exist"; + goto fail_lock; + } } - rcu_assign_pointer(ent->new->parent, aa_get_profile(p)); - } else if (policy != &ns->base) { - /* released on profile replacement or free_profile */ - struct aa_profile *p = (struct aa_profile *) policy; - rcu_assign_pointer(ent->new->parent, aa_get_profile(p)); } + if (!p && policy != &ns->base) + /* released on profile replacement or free_profile */ + p = (struct aa_profile *) policy; + rcu_assign_pointer(ent->new->parent, aa_get_profile(p)); } /* create new fs entries for introspection if needed */ -- GitLab From 64a27ba984342d6c5cf5facc278de5c5df1fd3ff Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sat, 8 Oct 2022 14:34:09 +0800 Subject: [PATCH 0181/1423] AppArmor: Fix kernel-doc security/apparmor/audit.c:93: warning: expecting prototype for audit_base(). Prototype was for audit_pre() instead. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2339 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: John Johansen --- security/apparmor/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 8dfdda98fbf13..5a7978aa4b19e 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -83,7 +83,7 @@ static const char *const aa_class_names[] = { */ /** - * audit_base - core AppArmor function. + * audit_pre() - core AppArmor function. * @ab: audit buffer to fill (NOT NULL) * @ca: audit structure containing data to audit (NOT NULL) * -- GitLab From 391f121150a5191c932e02775b6e29e59a3f5a94 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sat, 8 Oct 2022 14:34:10 +0800 Subject: [PATCH 0182/1423] LSM: Fix kernel-doc security/apparmor/lsm.c:753: warning: expecting prototype for apparmor_bprm_committed_cred(). Prototype was for apparmor_bprm_committed_creds() instead. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2338 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: John Johansen --- security/apparmor/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e2b951c4988c..ca4d190a737d3 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -741,7 +741,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm) } /** - * apparmor_bprm_committed_cred - do cleanup after new creds committed + * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(struct linux_binprm *bprm) -- GitLab From a2217387c3ec09117b3b6eaa5ec8a0d7d347d4ba Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sat, 8 Oct 2022 14:34:11 +0800 Subject: [PATCH 0183/1423] AppArmor: Fix kernel-doc security/apparmor/ipc.c:53: warning: expecting prototype for audit_cb(). Prototype was for audit_signal_cb() instead. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2337 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: John Johansen --- security/apparmor/ipc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 1d4099385bdf8..5acde746775f7 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -45,7 +45,7 @@ static const char *audit_signal_mask(u32 mask) } /** - * audit_cb - call back for signal specific audit fields + * audit_signal_cb() - call back for signal specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ -- GitLab From 37923d4321b1e38170086da2c117f78f2b0f49c6 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 21 Oct 2022 08:46:04 +0800 Subject: [PATCH 0184/1423] apparmor: Use pointer to struct aa_label for lbs_cred According to the implementations of cred_label() and set_cred_label(), we should use pointer to struct aa_label for lbs_cred instead of struct aa_task_ctx, this patch fixes it. Fixes: bbd3662a8348 ("Infrastructure management of the cred security blob") Signed-off-by: Xiu Jianfeng Signed-off-by: John Johansen --- security/apparmor/lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index ca4d190a737d3..25114735bc11c 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1198,10 +1198,10 @@ static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb #endif /* - * The cred blob is a pointer to, not an instance of, an aa_task_ctx. + * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __lsm_ro_after_init = { - .lbs_cred = sizeof(struct aa_task_ctx *), + .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), }; -- GitLab From d44c692350d9a376abab46aa0d70587951971068 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 14 Oct 2022 16:42:55 +0800 Subject: [PATCH 0185/1423] apparmor: Fix spelling of function name in comment block 'resouce' -> 'resource' Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2396 Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: John Johansen --- security/apparmor/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index 1b75d8343a8db..e859481648962 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -68,7 +68,7 @@ static int audit_resource(struct aa_profile *profile, unsigned int resource, } /** - * aa_map_resouce - map compiled policy resource to internal # + * aa_map_resource - map compiled policy resource to internal # * @resource: flattened policy resource number * * Returns: resource # for the current architecture. -- GitLab From 7dd426e33e2f9275ac03a306efdc89aa86515a52 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 25 Oct 2022 11:59:30 +0800 Subject: [PATCH 0186/1423] apparmor: fix a memleak in free_ruleset() When the aa_profile is released, we will call free_ruleset to release aa_ruleset, but we don't free the memory of aa_ruleset, so there will be memleak, fix it. unreferenced object 0xffff8881475df800 (size 1024): comm "apparmor_parser", pid 883, jiffies 4294899650 (age 9114.088s) hex dump (first 32 bytes): 00 f8 5d 47 81 88 ff ff 00 f8 5d 47 81 88 ff ff ..]G......]G.... 00 00 00 00 00 00 00 00 00 dc 65 47 81 88 ff ff ..........eG.... backtrace: [<00000000370e658e>] __kmem_cache_alloc_node+0x182/0x700 [<00000000f2f5a6d2>] kmalloc_trace+0x2c/0x130 [<00000000c5c905b3>] aa_alloc_profile+0x1bc/0x5c0 [<00000000bc4fa72b>] unpack_profile+0x319/0x30c0 [<00000000eab791e9>] aa_unpack+0x307/0x1450 [<000000002c3a6ee1>] aa_replace_profiles+0x1b8/0x3790 [<00000000d0c3fd54>] policy_update+0x35a/0x890 [<00000000d04fed90>] profile_replace+0x1d1/0x260 [<00000000cba0c0a7>] vfs_write+0x283/0xd10 [<000000006bae64a5>] ksys_write+0x134/0x260 [<00000000b2fd8f31>] __x64_sys_write+0x78/0xb0 [<00000000f3c8a015>] do_syscall_64+0x5c/0x90 [<00000000a242b1db>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 217af7e2f4de ("apparmor: refactor profile rules and attachments") Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/policy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 66034cf96f4c8..51e8184e0fec1 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -215,6 +215,7 @@ static void free_ruleset(struct aa_ruleset *rules) for (i = 0; i < rules->secmark_count; i++) kfree_sensitive(rules->secmark[i].label); kfree_sensitive(rules->secmark); + kfree_sensitive(rules); } struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp) -- GitLab From 3265949f7cd36a724a35020202c618094be1cf28 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 21 Oct 2022 17:36:02 +0800 Subject: [PATCH 0187/1423] apparmor: Fix memleak issue in unpack_profile() Before aa_alloc_profile(), it has allocated string for @*ns_name if @tmpns is not NULL, so directly return -ENOMEM if aa_alloc_profile() failed will cause a memleak issue, and even if aa_alloc_profile() succeed, in the @fail_profile tag of aa_unpack(), it need to free @ns_name as well, this patch fixes them. Fixes: 736ec752d95e ("AppArmor: policy routines for loading and unpacking policy") Fixes: 04dc715e24d0 ("apparmor: audit policy ns specified in policy load") Signed-off-by: Xiu Jianfeng Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 2e028d540c6b9..1bf8cfb8700aa 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -858,8 +858,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } profile = aa_alloc_profile(name, NULL, GFP_KERNEL); - if (!profile) - return ERR_PTR(-ENOMEM); + if (!profile) { + info = "out of memory"; + error = -ENOMEM; + goto fail; + } rules = list_first_entry(&profile->rules, typeof(*rules), list); /* profile renaming is optional */ @@ -1090,6 +1093,10 @@ fail: if (error == 0) /* default error covers most cases */ error = -EPROTO; + if (*ns_name) { + kfree(*ns_name); + *ns_name = NULL; + } if (profile) name = NULL; else if (!name) @@ -1392,6 +1399,7 @@ int aa_unpack(struct aa_loaddata *udata, struct list_head *lh, { struct aa_load_ent *tmp, *ent; struct aa_profile *profile = NULL; + char *ns_name = NULL; int error; struct aa_ext e = { .start = udata->data, @@ -1401,7 +1409,6 @@ int aa_unpack(struct aa_loaddata *udata, struct list_head *lh, *ns = NULL; while (e.pos < e.end) { - char *ns_name = NULL; void *start; error = verify_header(&e, e.pos == e.start, ns); if (error) @@ -1432,6 +1439,7 @@ int aa_unpack(struct aa_loaddata *udata, struct list_head *lh, ent->new = profile; ent->ns_name = ns_name; + ns_name = NULL; list_add_tail(&ent->list, lh); } udata->abi = e.version & K_ABI_MASK; @@ -1452,6 +1460,7 @@ int aa_unpack(struct aa_loaddata *udata, struct list_head *lh, return 0; fail_profile: + kfree(ns_name); aa_put_profile(profile); fail: -- GitLab From 6de0cb80e601df16f481a614daa0e84adbf0b552 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 24 Oct 2022 16:08:28 +0800 Subject: [PATCH 0188/1423] gpio: ftgpio010: use device name for gpiochip name & label Currently, we use just the fixed string "FTGPIO010" as the gpiochip name for ftgpio010 drivers. Because it's fixed, this means we cannot distinguish multiple ftgpio010 devices present on a single system. This change uses the dev_name() instead, which should be unique between multiple instances. Signed-off-by: Jeremy Kerr Acked-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ftgpio010.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index f77a965f5780d..2728672ef9f83 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -277,7 +277,7 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) dev_err(dev, "unable to init generic GPIO\n"); goto dis_clk; } - g->gc.label = "FTGPIO010"; + g->gc.label = dev_name(dev); g->gc.base = -1; g->gc.parent = dev; g->gc.owner = THIS_MODULE; -- GitLab From b9b1fc1ae1191243d3956888c65a280a9b2c847f Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 18 Sep 2022 12:50:43 -0400 Subject: [PATCH 0189/1423] gpio: idio-16: Introduce the ACCES IDIO-16 GPIO library module Exposes consumer library functions to facilitate communication with devices within the ACCES IDIO-16 family such as the 104-IDIO-16 and the PCI-IDIO-16. A CONFIG_GPIO_IDIO_16 Kconfig option is introduced by this patch. Modules wanting access to these idio-16 library functions should select this Kconfig option and import the GPIO_IDIO_16 symbol namespace. Cc: Andy Shevchenko Signed-off-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 7 ++ drivers/gpio/Kconfig | 9 +++ drivers/gpio/Makefile | 1 + drivers/gpio/gpio-idio-16.c | 146 ++++++++++++++++++++++++++++++++++++ drivers/gpio/gpio-idio-16.h | 71 ++++++++++++++++++ 5 files changed, 234 insertions(+) create mode 100644 drivers/gpio/gpio-idio-16.c create mode 100644 drivers/gpio/gpio-idio-16.h diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..74efa0492c430 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -312,6 +312,13 @@ L: linux-iio@vger.kernel.org S: Maintained F: drivers/counter/104-quad-8.c +ACCES IDIO-16 GPIO LIBRARY +M: William Breathitt Gray +L: linux-gpio@vger.kernel.org +S: Maintained +F: drivers/gpio/gpio-idio-16.c +F: drivers/gpio/gpio-idio-16.h + ACCES PCI-IDIO-16 GPIO DRIVER M: William Breathitt Gray L: linux-gpio@vger.kernel.org diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index e034f752e7cef..3f8cf6e2165e4 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -109,6 +109,15 @@ config GPIO_REGMAP config GPIO_MAX730X tristate +config GPIO_IDIO_16 + tristate + help + Enables support for the idio-16 library functions. The idio-16 library + provides functions to facilitate communication with devices within the + ACCES IDIO-16 family such as the 104-IDIO-16 and the PCI-IDIO-16. + + If built as a module its name will be gpio-idio-16. + menu "Memory mapped GPIO drivers" depends on HAS_IOMEM diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 84fae267e8ebf..37a0b7ebda434 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_GPIO_HLWD) += gpio-hlwd.o obj-$(CONFIG_HTC_EGPIO) += gpio-htc-egpio.o obj-$(CONFIG_GPIO_I8255) += gpio-i8255.o obj-$(CONFIG_GPIO_ICH) += gpio-ich.o +obj-$(CONFIG_GPIO_IDIO_16) += gpio-idio-16.o obj-$(CONFIG_GPIO_IDT3243X) += gpio-idt3243x.o obj-$(CONFIG_GPIO_IMX_SCU) += gpio-imx-scu.o obj-$(CONFIG_GPIO_IOP) += gpio-iop.o diff --git a/drivers/gpio/gpio-idio-16.c b/drivers/gpio/gpio-idio-16.c new file mode 100644 index 0000000000000..13315242d2209 --- /dev/null +++ b/drivers/gpio/gpio-idio-16.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GPIO library for the ACCES IDIO-16 family + * Copyright (C) 2022 William Breathitt Gray + */ +#include +#include +#include +#include +#include +#include + +#include "gpio-idio-16.h" + +#define DEFAULT_SYMBOL_NAMESPACE GPIO_IDIO_16 + +/** + * idio_16_get - get signal value at signal offset + * @reg: ACCES IDIO-16 device registers + * @state: ACCES IDIO-16 device state + * @offset: offset of signal to get + * + * Returns the signal value (0=low, 1=high) for the signal at @offset. + */ +int idio_16_get(struct idio_16 __iomem *const reg, + struct idio_16_state *const state, const unsigned long offset) +{ + const unsigned long mask = BIT(offset); + + if (offset < IDIO_16_NOUT) + return test_bit(offset, state->out_state); + + if (offset < 24) + return !!(ioread8(®->in0_7) & (mask >> IDIO_16_NOUT)); + + if (offset < 32) + return !!(ioread8(®->in8_15) & (mask >> 24)); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(idio_16_get); + +/** + * idio_16_get_multiple - get multiple signal values at multiple signal offsets + * @reg: ACCES IDIO-16 device registers + * @state: ACCES IDIO-16 device state + * @mask: mask of signals to get + * @bits: bitmap to store signal values + * + * Stores in @bits the values (0=low, 1=high) for the signals defined by @mask. + */ +void idio_16_get_multiple(struct idio_16 __iomem *const reg, + struct idio_16_state *const state, + const unsigned long *const mask, + unsigned long *const bits) +{ + unsigned long flags; + const unsigned long out_mask = GENMASK(IDIO_16_NOUT - 1, 0); + + spin_lock_irqsave(&state->lock, flags); + + bitmap_replace(bits, bits, state->out_state, &out_mask, IDIO_16_NOUT); + if (*mask & GENMASK(23, 16)) + bitmap_set_value8(bits, ioread8(®->in0_7), 16); + if (*mask & GENMASK(31, 24)) + bitmap_set_value8(bits, ioread8(®->in8_15), 24); + + spin_unlock_irqrestore(&state->lock, flags); +} +EXPORT_SYMBOL_GPL(idio_16_get_multiple); + +/** + * idio_16_set - set signal value at signal offset + * @reg: ACCES IDIO-16 device registers + * @state: ACCES IDIO-16 device state + * @offset: offset of signal to set + * @value: value of signal to set + * + * Assigns output @value for the signal at @offset. + */ +void idio_16_set(struct idio_16 __iomem *const reg, + struct idio_16_state *const state, const unsigned long offset, + const unsigned long value) +{ + unsigned long flags; + + if (offset >= IDIO_16_NOUT) + return; + + spin_lock_irqsave(&state->lock, flags); + + __assign_bit(offset, state->out_state, value); + if (offset < 8) + iowrite8(bitmap_get_value8(state->out_state, 0), ®->out0_7); + else + iowrite8(bitmap_get_value8(state->out_state, 8), ®->out8_15); + + spin_unlock_irqrestore(&state->lock, flags); +} +EXPORT_SYMBOL_GPL(idio_16_set); + +/** + * idio_16_set_multiple - set signal values at multiple signal offsets + * @reg: ACCES IDIO-16 device registers + * @state: ACCES IDIO-16 device state + * @mask: mask of signals to set + * @bits: bitmap of signal output values + * + * Assigns output values defined by @bits for the signals defined by @mask. + */ +void idio_16_set_multiple(struct idio_16 __iomem *const reg, + struct idio_16_state *const state, + const unsigned long *const mask, + const unsigned long *const bits) +{ + unsigned long flags; + + spin_lock_irqsave(&state->lock, flags); + + bitmap_replace(state->out_state, state->out_state, bits, mask, + IDIO_16_NOUT); + if (*mask & GENMASK(7, 0)) + iowrite8(bitmap_get_value8(state->out_state, 0), ®->out0_7); + if (*mask & GENMASK(15, 8)) + iowrite8(bitmap_get_value8(state->out_state, 8), ®->out8_15); + + spin_unlock_irqrestore(&state->lock, flags); +} +EXPORT_SYMBOL_GPL(idio_16_set_multiple); + +/** + * idio_16_state_init - initialize idio_16_state structure + * @state: ACCES IDIO-16 device state + * + * Initializes the ACCES IDIO-16 device @state for use in idio-16 library + * functions. + */ +void idio_16_state_init(struct idio_16_state *const state) +{ + spin_lock_init(&state->lock); +} +EXPORT_SYMBOL_GPL(idio_16_state_init); + +MODULE_AUTHOR("William Breathitt Gray"); +MODULE_DESCRIPTION("ACCES IDIO-16 GPIO Library"); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpio/gpio-idio-16.h b/drivers/gpio/gpio-idio-16.h new file mode 100644 index 0000000000000..928f8251a2bd9 --- /dev/null +++ b/drivers/gpio/gpio-idio-16.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2022 William Breathitt Gray */ +#ifndef _IDIO_16_H_ +#define _IDIO_16_H_ + +#include +#include + +/** + * struct idio_16 - IDIO-16 registers structure + * @out0_7: Read: FET Drive Outputs 0-7 + * Write: FET Drive Outputs 0-7 + * @in0_7: Read: Isolated Inputs 0-7 + * Write: Clear Interrupt + * @irq_ctl: Read: Enable IRQ + * Write: Disable IRQ + * @filter_ctl: Read: Activate Input Filters 0-15 + * Write: Deactivate Input Filters 0-15 + * @out8_15: Read: FET Drive Outputs 8-15 + * Write: FET Drive Outputs 8-15 + * @in8_15: Read: Isolated Inputs 8-15 + * Write: Unused + * @irq_status: Read: Interrupt status + * Write: Unused + */ +struct idio_16 { + u8 out0_7; + u8 in0_7; + u8 irq_ctl; + u8 filter_ctl; + u8 out8_15; + u8 in8_15; + u8 irq_status; +}; + +#define IDIO_16_NOUT 16 + +/** + * struct idio_16_state - IDIO-16 state structure + * @lock: synchronization lock for accessing device state + * @out_state: output signals state + */ +struct idio_16_state { + spinlock_t lock; + DECLARE_BITMAP(out_state, IDIO_16_NOUT); +}; + +/** + * idio_16_get_direction - get the I/O direction for a signal offset + * @offset: offset of signal to get direction + * + * Returns the signal direction (0=output, 1=input) for the signal at @offset. + */ +static inline int idio_16_get_direction(const unsigned long offset) +{ + return (offset >= IDIO_16_NOUT) ? 1 : 0; +} + +int idio_16_get(struct idio_16 __iomem *reg, struct idio_16_state *state, + unsigned long offset); +void idio_16_get_multiple(struct idio_16 __iomem *reg, + struct idio_16_state *state, + const unsigned long *mask, unsigned long *bits); +void idio_16_set(struct idio_16 __iomem *reg, struct idio_16_state *state, + unsigned long offset, unsigned long value); +void idio_16_set_multiple(struct idio_16 __iomem *reg, + struct idio_16_state *state, + const unsigned long *mask, const unsigned long *bits); +void idio_16_state_init(struct idio_16_state *state); + +#endif /* _IDIO_16_H_ */ -- GitLab From c4ec384cf726379e600764c7f2f7ad487280890a Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 25 Oct 2022 09:57:57 +0200 Subject: [PATCH 0190/1423] gpio: 104-idio-16: Utilize the idio-16 GPIO library The ACCES 104-IDIO-16 device is part of the ACCES IDIO-16 family, so the idio-16 GPIO library module is selected and utilized to consolidate code. Signed-off-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-104-idio-16.c | 88 ++++++--------------------------- 2 files changed, 17 insertions(+), 72 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f8cf6e2165e4..3b0a030ce79b0 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -858,6 +858,7 @@ config GPIO_104_IDIO_16 depends on PC104 select ISA_BUS_API select GPIOLIB_IRQCHIP + select GPIO_IDIO_16 help Enables GPIO support for the ACCES 104-IDIO-16 family (104-IDIO-16, 104-IDIO-16E, 104-IDO-16, 104-IDIO-8, 104-IDIO-8E, 104-IDO-8). The diff --git a/drivers/gpio/gpio-104-idio-16.c b/drivers/gpio/gpio-104-idio-16.c index 718bd54e2a255..098fbefdbe226 100644 --- a/drivers/gpio/gpio-104-idio-16.c +++ b/drivers/gpio/gpio-104-idio-16.c @@ -6,7 +6,7 @@ * This driver supports the following ACCES devices: 104-IDIO-16, * 104-IDIO-16E, 104-IDO-16, 104-IDIO-8, 104-IDIO-8E, and 104-IDO-8. */ -#include +#include #include #include #include @@ -21,6 +21,8 @@ #include #include +#include "gpio-idio-16.h" + #define IDIO_16_EXTENT 8 #define MAX_NUM_IDIO_16 max_num_isa_dev(IDIO_16_EXTENT) @@ -34,49 +36,26 @@ static unsigned int num_irq; module_param_hw_array(irq, uint, irq, &num_irq, 0); MODULE_PARM_DESC(irq, "ACCES 104-IDIO-16 interrupt line numbers"); -/** - * struct idio_16_reg - device registers structure - * @out0_7: Read: N/A - * Write: FET Drive Outputs 0-7 - * @in0_7: Read: Isolated Inputs 0-7 - * Write: Clear Interrupt - * @irq_ctl: Read: Enable IRQ - * Write: Disable IRQ - * @unused: N/A - * @out8_15: Read: N/A - * Write: FET Drive Outputs 8-15 - * @in8_15: Read: Isolated Inputs 8-15 - * Write: N/A - */ -struct idio_16_reg { - u8 out0_7; - u8 in0_7; - u8 irq_ctl; - u8 unused; - u8 out8_15; - u8 in8_15; -}; - /** * struct idio_16_gpio - GPIO device private data structure * @chip: instance of the gpio_chip * @lock: synchronization lock to prevent I/O race conditions * @irq_mask: I/O bits affected by interrupts * @reg: I/O address offset for the device registers - * @out_state: output bits state + * @state: ACCES IDIO-16 device state */ struct idio_16_gpio { struct gpio_chip chip; raw_spinlock_t lock; unsigned long irq_mask; - struct idio_16_reg __iomem *reg; - unsigned int out_state; + struct idio_16 __iomem *reg; + struct idio_16_state state; }; static int idio_16_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) { - if (offset > 15) + if (idio_16_get_direction(offset)) return GPIO_LINE_DIRECTION_IN; return GPIO_LINE_DIRECTION_OUT; @@ -98,15 +77,8 @@ static int idio_16_gpio_direction_output(struct gpio_chip *chip, static int idio_16_gpio_get(struct gpio_chip *chip, unsigned int offset) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - const unsigned int mask = BIT(offset-16); - if (offset < 16) - return -EINVAL; - - if (offset < 24) - return !!(ioread8(&idio16gpio->reg->in0_7) & mask); - - return !!(ioread8(&idio16gpio->reg->in8_15) & (mask>>8)); + return idio_16_get(idio16gpio->reg, &idio16gpio->state, offset); } static int idio_16_gpio_get_multiple(struct gpio_chip *chip, @@ -114,11 +86,7 @@ static int idio_16_gpio_get_multiple(struct gpio_chip *chip, { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - *bits = 0; - if (*mask & GENMASK(23, 16)) - *bits |= (unsigned long)ioread8(&idio16gpio->reg->in0_7) << 16; - if (*mask & GENMASK(31, 24)) - *bits |= (unsigned long)ioread8(&idio16gpio->reg->in8_15) << 24; + idio_16_get_multiple(idio16gpio->reg, &idio16gpio->state, mask, bits); return 0; } @@ -127,44 +95,16 @@ static void idio_16_gpio_set(struct gpio_chip *chip, unsigned int offset, int value) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - const unsigned int mask = BIT(offset); - unsigned long flags; - if (offset > 15) - return; - - raw_spin_lock_irqsave(&idio16gpio->lock, flags); - - if (value) - idio16gpio->out_state |= mask; - else - idio16gpio->out_state &= ~mask; - - if (offset > 7) - iowrite8(idio16gpio->out_state >> 8, &idio16gpio->reg->out8_15); - else - iowrite8(idio16gpio->out_state, &idio16gpio->reg->out0_7); - - raw_spin_unlock_irqrestore(&idio16gpio->lock, flags); + idio_16_set(idio16gpio->reg, &idio16gpio->state, offset, value); } static void idio_16_gpio_set_multiple(struct gpio_chip *chip, unsigned long *mask, unsigned long *bits) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - unsigned long flags; - raw_spin_lock_irqsave(&idio16gpio->lock, flags); - - idio16gpio->out_state &= ~*mask; - idio16gpio->out_state |= *mask & *bits; - - if (*mask & 0xFF) - iowrite8(idio16gpio->out_state, &idio16gpio->reg->out0_7); - if ((*mask >> 8) & 0xFF) - iowrite8(idio16gpio->out_state >> 8, &idio16gpio->reg->out8_15); - - raw_spin_unlock_irqrestore(&idio16gpio->lock, flags); + idio_16_set_multiple(idio16gpio->reg, &idio16gpio->state, mask, bits); } static void idio_16_irq_ack(struct irq_data *data) @@ -301,7 +241,10 @@ static int idio_16_probe(struct device *dev, unsigned int id) idio16gpio->chip.get_multiple = idio_16_gpio_get_multiple; idio16gpio->chip.set = idio_16_gpio_set; idio16gpio->chip.set_multiple = idio_16_gpio_set_multiple; - idio16gpio->out_state = 0xFFFF; + + idio_16_state_init(&idio16gpio->state); + /* FET off states are represented by bit values of "1" */ + bitmap_fill(idio16gpio->state.out_state, IDIO_16_NOUT); girq = &idio16gpio->chip.irq; gpio_irq_chip_set_chip(girq, &idio_16_irqchip); @@ -343,3 +286,4 @@ module_isa_driver_with_irq(idio_16_driver, num_idio_16, num_irq); MODULE_AUTHOR("William Breathitt Gray "); MODULE_DESCRIPTION("ACCES 104-IDIO-16 GPIO driver"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(GPIO_IDIO_16); -- GitLab From e7f758fa9b7fda8b91f2e429b2be93ae0b88ac33 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 18 Sep 2022 12:50:45 -0400 Subject: [PATCH 0191/1423] gpio: pci-idio-16: Utilize the idio-16 GPIO library The ACCES PCI-IDIO-16 device is part of the ACCES IDIO-16 family, so the idio-16 GPIO library module is selected and utilized to consolidate code. Signed-off-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-pci-idio-16.c | 119 ++++---------------------------- 2 files changed, 14 insertions(+), 106 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3b0a030ce79b0..5a04990f03cc8 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1563,6 +1563,7 @@ config GPIO_PCH config GPIO_PCI_IDIO_16 tristate "ACCES PCI-IDIO-16 GPIO support" select GPIOLIB_IRQCHIP + select GPIO_IDIO_16 help Enables GPIO support for the ACCES PCI-IDIO-16. An interrupt is generated when any of the inputs change state (low to high or high to diff --git a/drivers/gpio/gpio-pci-idio-16.c b/drivers/gpio/gpio-pci-idio-16.c index 71a13a394050f..a86ce748384bb 100644 --- a/drivers/gpio/gpio-pci-idio-16.c +++ b/drivers/gpio/gpio-pci-idio-16.c @@ -3,8 +3,7 @@ * GPIO driver for the ACCES PCI-IDIO-16 * Copyright (C) 2017 William Breathitt Gray */ -#include -#include +#include #include #include #include @@ -16,51 +15,28 @@ #include #include -/** - * struct idio_16_gpio_reg - GPIO device registers structure - * @out0_7: Read: FET Drive Outputs 0-7 - * Write: FET Drive Outputs 0-7 - * @in0_7: Read: Isolated Inputs 0-7 - * Write: Clear Interrupt - * @irq_ctl: Read: Enable IRQ - * Write: Disable IRQ - * @filter_ctl: Read: Activate Input Filters 0-15 - * Write: Deactivate Input Filters 0-15 - * @out8_15: Read: FET Drive Outputs 8-15 - * Write: FET Drive Outputs 8-15 - * @in8_15: Read: Isolated Inputs 8-15 - * Write: Unused - * @irq_status: Read: Interrupt status - * Write: Unused - */ -struct idio_16_gpio_reg { - u8 out0_7; - u8 in0_7; - u8 irq_ctl; - u8 filter_ctl; - u8 out8_15; - u8 in8_15; - u8 irq_status; -}; +#include "gpio-idio-16.h" /** * struct idio_16_gpio - GPIO device private data structure * @chip: instance of the gpio_chip * @lock: synchronization lock to prevent I/O race conditions * @reg: I/O address offset for the GPIO device registers + * @state: ACCES IDIO-16 device state * @irq_mask: I/O bits affected by interrupts */ struct idio_16_gpio { struct gpio_chip chip; raw_spinlock_t lock; - struct idio_16_gpio_reg __iomem *reg; + struct idio_16 __iomem *reg; + struct idio_16_state state; unsigned long irq_mask; }; static int idio_16_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) { - if (offset > 15) + if (idio_16_get_direction(offset)) return GPIO_LINE_DIRECTION_IN; return GPIO_LINE_DIRECTION_OUT; @@ -82,43 +58,16 @@ static int idio_16_gpio_direction_output(struct gpio_chip *chip, static int idio_16_gpio_get(struct gpio_chip *chip, unsigned int offset) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - unsigned long mask = BIT(offset); - - if (offset < 8) - return !!(ioread8(&idio16gpio->reg->out0_7) & mask); - - if (offset < 16) - return !!(ioread8(&idio16gpio->reg->out8_15) & (mask >> 8)); - - if (offset < 24) - return !!(ioread8(&idio16gpio->reg->in0_7) & (mask >> 16)); - return !!(ioread8(&idio16gpio->reg->in8_15) & (mask >> 24)); + return idio_16_get(idio16gpio->reg, &idio16gpio->state, offset); } static int idio_16_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask, unsigned long *bits) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - unsigned long offset; - unsigned long gpio_mask; - void __iomem *ports[] = { - &idio16gpio->reg->out0_7, &idio16gpio->reg->out8_15, - &idio16gpio->reg->in0_7, &idio16gpio->reg->in8_15, - }; - void __iomem *port_addr; - unsigned long port_state; - - /* clear bits array to a clean slate */ - bitmap_zero(bits, chip->ngpio); - - for_each_set_clump8(offset, gpio_mask, mask, ARRAY_SIZE(ports) * 8) { - port_addr = ports[offset / 8]; - port_state = ioread8(port_addr) & gpio_mask; - - bitmap_set_value8(bits, port_state, offset); - } + idio_16_get_multiple(idio16gpio->reg, &idio16gpio->state, mask, bits); return 0; } @@ -126,61 +75,16 @@ static void idio_16_gpio_set(struct gpio_chip *chip, unsigned int offset, int value) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - unsigned int mask = BIT(offset); - void __iomem *base; - unsigned long flags; - unsigned int out_state; - - if (offset > 15) - return; - - if (offset > 7) { - mask >>= 8; - base = &idio16gpio->reg->out8_15; - } else - base = &idio16gpio->reg->out0_7; - - raw_spin_lock_irqsave(&idio16gpio->lock, flags); - if (value) - out_state = ioread8(base) | mask; - else - out_state = ioread8(base) & ~mask; - - iowrite8(out_state, base); - - raw_spin_unlock_irqrestore(&idio16gpio->lock, flags); + idio_16_set(idio16gpio->reg, &idio16gpio->state, offset, value); } static void idio_16_gpio_set_multiple(struct gpio_chip *chip, unsigned long *mask, unsigned long *bits) { struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); - unsigned long offset; - unsigned long gpio_mask; - void __iomem *ports[] = { - &idio16gpio->reg->out0_7, &idio16gpio->reg->out8_15, - }; - size_t index; - void __iomem *port_addr; - unsigned long bitmask; - unsigned long flags; - unsigned long out_state; - for_each_set_clump8(offset, gpio_mask, mask, ARRAY_SIZE(ports) * 8) { - index = offset / 8; - port_addr = ports[index]; - - bitmask = bitmap_get_value8(bits, offset) & gpio_mask; - - raw_spin_lock_irqsave(&idio16gpio->lock, flags); - - out_state = ioread8(port_addr) & ~gpio_mask; - out_state |= bitmask; - iowrite8(out_state, port_addr); - - raw_spin_unlock_irqrestore(&idio16gpio->lock, flags); - } + idio_16_set_multiple(idio16gpio->reg, &idio16gpio->state, mask, bits); } static void idio_16_irq_ack(struct irq_data *data) @@ -335,6 +239,8 @@ static int idio_16_probe(struct pci_dev *pdev, const struct pci_device_id *id) idio16gpio->chip.set = idio_16_gpio_set; idio16gpio->chip.set_multiple = idio_16_gpio_set_multiple; + idio_16_state_init(&idio16gpio->state); + girq = &idio16gpio->chip.irq; girq->chip = &idio_16_irqchip; /* This will let us handle the parent IRQ in the driver */ @@ -379,3 +285,4 @@ module_pci_driver(idio_16_driver); MODULE_AUTHOR("William Breathitt Gray "); MODULE_DESCRIPTION("ACCES PCI-IDIO-16 GPIO driver"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(GPIO_IDIO_16); -- GitLab From eac001bf4a5b7d857ec228cd18b4e3644a5ceeb9 Mon Sep 17 00:00:00 2001 From: Xiang Yang Date: Thu, 20 Oct 2022 09:44:26 +0800 Subject: [PATCH 0192/1423] gpiolib: acpi: Use METHOD_NAME__AEI macro for acpi_walk_resources Using the METHOD_NAME__AEI macro instead of using "_AEI" directly. Signed-off-by: Xiang Yang Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index a7d2358736fe7..064ba5150fd4d 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -512,7 +512,7 @@ void acpi_gpiochip_request_interrupts(struct gpio_chip *chip) if (ACPI_FAILURE(status)) return; - acpi_walk_resources(handle, "_AEI", + acpi_walk_resources(handle, METHOD_NAME__AEI, acpi_gpiochip_alloc_event, acpi_gpio); mutex_lock(&acpi_gpio_deferred_req_irqs_lock); -- GitLab From 8d259847243d1e21a866e828c4ce90d759f3d17b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 20 Oct 2022 18:39:14 +0300 Subject: [PATCH 0193/1423] gpiolib: cdev: Fix typo in kernel doc for struct line When eflags has been renamed to the edflags, the kernel doc change were missed. Update kernel doc accordingly. Fixes: b1a92e94560d ("gpiolib: cdev: consolidate edge detector configuration flags") Signed-off-by: Andy Shevchenko Reviewed-by: Kent Gibson --- drivers/gpio/gpiolib-cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 0cb6b468f364f..08606f32372ce 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -410,7 +410,7 @@ out_free_lh: * @desc: the GPIO descriptor for this line. * @req: the corresponding line request * @irq: the interrupt triggered in response to events on this GPIO - * @eflags: the edge flags, GPIO_V2_LINE_FLAG_EDGE_RISING and/or + * @edflags: the edge flags, GPIO_V2_LINE_FLAG_EDGE_RISING and/or * GPIO_V2_LINE_FLAG_EDGE_FALLING, indicating the edge detection applied * @timestamp_ns: cache for the timestamp storing it between hardirq and * IRQ thread, used to bring the timestamp close to the actual event -- GitLab From c3db3c2fd9992c08f49aa93752d3c103c3a4f6aa Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 24 Oct 2022 19:30:12 +0200 Subject: [PATCH 0194/1423] f2fs: should put a page when checking the summary info The commit introduces another bug. Cc: stable@vger.kernel.org Fixes: c6ad7fd16657e ("f2fs: fix to do sanity check on summary info") Signed-off-by: Pavel Machek Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee08..dab794225cce7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1110,6 +1110,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", ofs_in_node, dni->ino, dni->nid, max_addrs); + f2fs_put_page(node_page, 1); return false; } -- GitLab From 14dc00a0e2dbea4b685ab9723ff511fcfd223c18 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2022 17:52:05 -0700 Subject: [PATCH 0195/1423] f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate f2fs_unlink -> f2fs_lock_op -> d_invalidate -> shrink_dentry_list -> iput_final -> f2fs_evict_inode -> f2fs_lock_op Reviewed-by: Chao Yu Tested-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212a..e104409c3a0e5 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -632,6 +632,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -642,8 +644,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: -- GitLab From ae25e00ba84073450c07d8ffd2d74f914a027230 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 Oct 2022 18:32:49 +0300 Subject: [PATCH 0196/1423] x86/retpoline: Fix crash printing warning The first argument of WARN() is a condition, so this will use "addr" as the format string and possibly crash. Fixes: 3b6c1747da48 ("x86/retpoline: Add SKL retthunk retpolines") Signed-off-by: Dan Carpenter Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/Y1gBoUZrRK5N%2FlCB@kili/ --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 19221d77dc275..b4ac4e58c0106 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -418,7 +418,7 @@ clang_jcc: break; default: - WARN("%pS %px %*ph\n", addr, addr, 6, addr); + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } -- GitLab From d233ab3c5c5ed4b3d2201bddb71dab5a2946c31b Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Sun, 25 Sep 2022 02:47:57 +0200 Subject: [PATCH 0197/1423] riscv/vdso: typo therefor The adverbs 'therefor' and 'therefore' have different meaning. As the meaning here is 'consequently' the spelling should be 'therefore'. Signed-off-by: Heinrich Schuchardt Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20220925004757.9089-1-heinrich.schuchardt@canonical.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vdso.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index af981426fe0ff..a7644f46d0e56 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -10,7 +10,7 @@ /* * All systems with an MMU have a VDSO, but systems without an MMU don't - * support shared libraries and therefor don't have one. + * support shared libraries and therefore don't have one. */ #ifdef CONFIG_MMU -- GitLab From 079f0c21ef6d79f80b19b64f5e0218d5a328c4cd Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Thu, 20 Oct 2022 16:31:55 +0200 Subject: [PATCH 0198/1423] s390/mm: gmap: sort out physical vs virtual pointers usage Fix virtual vs physical address confusion (which currently are the same). Signed-off-by: Nico Boehr Reviewed-by: Pierre Morel Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221020143159.294605-2-nrb@linux.ibm.com Message-Id: <20221020143159.294605-2-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/mm/gmap.c | 147 +++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 71 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 02d15c8dc92e9..2ccfcc8a3863e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -72,7 +72,7 @@ static struct gmap *gmap_alloc(unsigned long limit) goto out_free; page->index = 0; list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -311,12 +311,12 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); page->index = gaddr; page = NULL; @@ -557,7 +557,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -565,7 +565,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -573,7 +573,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -813,7 +813,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -821,7 +821,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -829,7 +829,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -837,7 +837,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; @@ -1150,7 +1150,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; @@ -1335,7 +1335,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; + unsigned long *ste; + phys_addr_t sto, pgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1343,13 +1344,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1365,19 +1366,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; struct page *page; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1392,7 +1393,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1401,12 +1403,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1422,19 +1424,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1449,7 +1451,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1458,12 +1461,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1479,7 +1482,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1487,11 +1490,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1506,8 +1509,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1515,12 +1519,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1536,22 +1540,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1573,7 +1578,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1748,7 +1753,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; @@ -1760,7 +1766,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, page->index = r2t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1775,9 +1781,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); @@ -1798,8 +1804,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1832,7 +1837,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; @@ -1844,7 +1850,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, page->index = r3t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1859,9 +1865,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); @@ -1882,8 +1888,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1916,7 +1921,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; @@ -1928,7 +1934,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, page->index = sgt & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1943,9 +1949,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; @@ -1966,8 +1972,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -2040,8 +2045,9 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; + unsigned long *table; struct page *page; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); @@ -2052,7 +2058,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, page->index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + s_pgt = page_to_phys(page); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2085,8 +2091,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; -- GitLab From 6b33e68ab30949f9657e2acc59766977ae63e1cc Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Thu, 20 Oct 2022 16:31:56 +0200 Subject: [PATCH 0199/1423] s390/entry: sort out physical vs virtual pointers usage in sie64a Fix virtual vs physical address confusion (which currently are the same). sie_block is accessed in entry.S and passed it to hardware, which is why both its physical and virtual address are needed. To avoid every caller having to do the virtual-physical conversion, add a new function sie64a() which converts the virtual address to physical. Signed-off-by: Nico Boehr Reviewed-by: Alexander Gordeev Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221020143159.294605-3-nrb@linux.ibm.com Message-Id: <20221020143159.294605-3-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 8 +++++++- arch/s390/include/asm/stacktrace.h | 1 + arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/entry.S | 26 +++++++++++++++----------- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index b1e98a9ed152b..9a31d00e99b3c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1017,7 +1017,13 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); -extern int sie64a(struct kvm_s390_sie_block *, u64 *); +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa); +} + extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index b23c658dce77f..1802be5abb5dc 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -46,6 +46,7 @@ struct stack_frame { unsigned long sie_savearea; unsigned long sie_reason; unsigned long sie_flags; + unsigned long sie_control_block_phys; }; }; unsigned long gprs[10]; diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index d8ce965c0a97c..3f8e760298c22 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -62,6 +62,7 @@ int main(void) OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index d2a1f2f4f5b88..12e1773a94a47 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -225,18 +225,20 @@ ENDPROC(__switch_to) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area */ -ENTRY(sie64a) +ENTRY(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 jz .Lsie_gmap @@ -248,6 +250,7 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_entry: sie 0(%r14) @@ -258,13 +261,14 @@ ENTRY(sie64a) BPOFF BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce .Lsie_done: # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program +# Other instructions between __sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. .Lrewind_pad6: nopr 7 @@ -293,8 +297,8 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +ENDPROC(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -373,7 +377,7 @@ ENTRY(pgm_check_handler) j 3f # -> fault in user space .Lpgm_skip_asce: #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a + # cleanup critical section for program checks in __sie64a OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f SIEEXIT lghi %r10,_PIF_GUEST_FAULT -- GitLab From fe0ef00304639cae82df7c9ad6a15286bd5f876e Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Thu, 20 Oct 2022 16:31:57 +0200 Subject: [PATCH 0200/1423] KVM: s390: sort out physical vs virtual pointers usage Fix virtual vs physical address confusion (which currently are the same). Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221020143159.294605-4-nrb@linux.ibm.com Message-Id: <20221020143159.294605-4-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/intercept.c | 2 +- arch/s390/kvm/kvm-s390.c | 44 ++++++++++++++++++-------------- arch/s390/kvm/kvm-s390.h | 5 ++-- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9a31d00e99b3c..931f978758997 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -276,6 +276,7 @@ struct kvm_s390_sie_block { #define ECB3_AES 0x04 #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 88112065d9411..b703b5202f257 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -217,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 45d4b8182b073..0f7ff0c9019f8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3329,28 +3329,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -3381,6 +3383,7 @@ static int sca_switch_to_extended(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; if (kvm->arch.use_esca) return 0; @@ -3389,8 +3392,9 @@ static int sca_switch_to_extended(struct kvm *kvm) if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -3610,15 +3614,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -3628,7 +3635,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) @@ -3685,9 +3692,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3737,7 +3743,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f6fd668f887e8..a60d1e5c44cd1 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -23,7 +23,8 @@ /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; extern debug_info_t *kvm_s390_dbf_uv; @@ -233,7 +234,7 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) { - u32 gd = (u32)(u64)kvm->arch.gisa_int.origin; + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); if (gd && sclp.has_gisaf) gd |= GISA_FORMAT1; -- GitLab From b99f4512197acc10f63b5fb462c088c2f62b5120 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Thu, 20 Oct 2022 16:31:58 +0200 Subject: [PATCH 0201/1423] KVM: s390: sida: sort out physical vs virtual pointers usage All callers of the sida_origin() macro actually expected a virtual address, so rename it to sida_addr() and hand out a virtual address. At some places, the macro wasn't used, potentially creating problems if the sida size ever becomes nonzero (not currently the case), so let's start using it everywhere now while at it. Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221020143159.294605-5-nrb@linux.ibm.com Message-Id: <20221020143159.294605-5-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 3 +-- arch/s390/kvm/intercept.c | 7 +++---- arch/s390/kvm/kvm-s390.c | 9 +++++---- arch/s390/kvm/priv.c | 3 +-- arch/s390/kvm/pv.c | 8 +++++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 931f978758997..21f1339a41974 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -142,8 +142,7 @@ struct mcck_volatile_info { CR14_EXTERNAL_DAMAGE_SUBMASK) #define SIDAD_SIZE_MASK 0xff -#define sida_origin(sie_block) \ - ((sie_block)->sidad & PAGE_MASK) +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) #define sida_size(sie_block) \ ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index b703b5202f257..0ee02dae14b2b 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -409,8 +409,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) out: if (!cc) { if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)(sida_origin(vcpu->arch.sie_block)), - sctns, PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); } else { r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); if (r) { @@ -464,7 +463,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu) static int handle_pv_spx(struct kvm_vcpu *vcpu) { - u32 pref = *(u32 *)vcpu->arch.sie_block->sidad; + u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); kvm_s390_set_prefix(vcpu, pref); trace_kvm_s390_handle_prefix(vcpu, 1, pref); @@ -497,7 +496,7 @@ static int handle_pv_sclp(struct kvm_vcpu *vcpu) static int handle_pv_uvc(struct kvm_vcpu *vcpu) { - struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad; + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); struct uv_cb_cts uvcb = { .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, .header.len = sizeof(uvcb), diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0f7ff0c9019f8..bd6e0201bfe5f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5167,6 +5167,7 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *sida_addr; int r = 0; if (mop->flags || !mop->size) @@ -5178,16 +5179,16 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, if (!kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { case KVM_S390_MEMOP_SIDA_READ: - if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), mop->size)) + if (copy_to_user(uaddr, sida_addr, mop->size)) r = -EFAULT; break; case KVM_S390_MEMOP_SIDA_WRITE: - if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + - mop->sida_offset), uaddr, mop->size)) + if (copy_from_user(sida_addr, uaddr, mop->size)) r = -EFAULT; break; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3335fa09b6f1d..9f8a192bd750f 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -924,8 +924,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) return -EREMOTE; } if (kvm_s390_pv_cpu_is_protected(vcpu)) { - memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, - PAGE_SIZE); + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); rc = 0; } else { rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 7cb7799a0acb4..c7435c37cdfea 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -44,7 +44,7 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); - free_page(sida_origin(vcpu->arch.sie_block)); + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); vcpu->arch.sie_block->pv_handle_cpu = 0; vcpu->arch.sie_block->pv_handle_config = 0; memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); @@ -66,6 +66,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) .header.cmd = UVC_CMD_CREATE_SEC_CPU, .header.len = sizeof(uvcb), }; + void *sida_addr; int cc; if (kvm_s390_pv_cpu_get_handle(vcpu)) @@ -83,12 +84,13 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; /* Alloc Secure Instruction Data Area Designation */ - vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vcpu->arch.sie_block->sidad) { + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); return -ENOMEM; } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); cc = uv_call(0, (u64)&uvcb); *rc = uvcb.header.rc; -- GitLab From 4435b79a366495a5cb43b792d9e7d69d489428cd Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Thu, 20 Oct 2022 16:31:59 +0200 Subject: [PATCH 0202/1423] KVM: s390: pv: sort out physical vs virtual pointers usage Fix virtual vs physical address confusion (which currently are the same). Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221020143159.294605-6-nrb@linux.ibm.com Message-Id: <20221020143159.294605-6-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/pv.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c7435c37cdfea..48c4f57d5d76f 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -80,8 +80,8 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) /* Input */ uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); uvcb.num = vcpu->arch.sie_block->icpua; - uvcb.state_origin = (u64)vcpu->arch.sie_block; - uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); /* Alloc Secure Instruction Data Area Designation */ sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); @@ -228,8 +228,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; uvcb.guest_asce = kvm->arch.gmap->asce; - uvcb.guest_sca = (unsigned long)kvm->arch.sca; - uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; cc = uv_call_sched(0, (u64)&uvcb); -- GitLab From 77b533411595668659ce5aaade4ca36c7aa2c488 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Tue, 25 Oct 2022 10:20:39 +0200 Subject: [PATCH 0203/1423] KVM: s390: VSIE: sort out virtual/physical address in pin_guest_page pin_guest_page() used page_to_virt() to calculate the hpa of the pinned page. This currently works, because virtual and physical addresses are the same. Use page_to_phys() instead to resolve the virtual-real address confusion. One caller of pin_guest_page() actually expected the hpa to be a hva, so add the missing phys_to_virt() conversion here. Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Acked-by: David Hildenbrand Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20221025082039.117372-2-nrb@linux.ibm.com Message-Id: <20221025082039.117372-2-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/vsie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 94138f8f0c1c3..0e9d020d70932 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -654,7 +654,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) page = gfn_to_page(kvm, gpa_to_gfn(gpa)); if (is_error_page(page)) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } @@ -869,7 +869,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } -- GitLab From 2a903ca922d007a0b40ca425ce55b5f0a0e01956 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 7 Oct 2022 13:46:47 +0200 Subject: [PATCH 0204/1423] dt-bindings: gpio: Add gpio-latch binding document This adds a binding for a GPIO multiplexer driver based on latches connected to other GPIOs. Signed-off-by: Sascha Hauer Reviewed-by: Rob Herring Reviewed-by: Serge Semin Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- .../devicetree/bindings/gpio/gpio-latch.yaml | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 Documentation/devicetree/bindings/gpio/gpio-latch.yaml diff --git a/Documentation/devicetree/bindings/gpio/gpio-latch.yaml b/Documentation/devicetree/bindings/gpio/gpio-latch.yaml new file mode 100644 index 0000000000000..1ed82a2cebdaa --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/gpio-latch.yaml @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/gpio-latch.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: GPIO latch controller + +maintainers: + - Sascha Hauer + +description: | + This binding describes a GPIO multiplexer based on latches connected to + other GPIOs, like this: + + CLK0 ----------------------. ,--------. + CLK1 -------------------. `--------|> #0 | + | | | + OUT0 ----------------+--|-----------|D0 Q0|-----|< + OUT1 --------------+-|--|-----------|D1 Q1|-----|< + OUT2 ------------+-|-|--|-----------|D2 Q2|-----|< + OUT3 ----------+-|-|-|--|-----------|D3 Q3|-----|< + OUT4 --------+-|-|-|-|--|-----------|D4 Q4|-----|< + OUT5 ------+-|-|-|-|-|--|-----------|D5 Q5|-----|< + OUT6 ----+-|-|-|-|-|-|--|-----------|D6 Q6|-----|< + OUT7 --+-|-|-|-|-|-|-|--|-----------|D7 Q7|-----|< + | | | | | | | | | `--------' + | | | | | | | | | + | | | | | | | | | ,--------. + | | | | | | | | `-----------|> #1 | + | | | | | | | | | | + | | | | | | | `--------------|D0 Q0|-----|< + | | | | | | `----------------|D1 Q1|-----|< + | | | | | `------------------|D2 Q2|-----|< + | | | | `--------------------|D3 Q3|-----|< + | | | `----------------------|D4 Q4|-----|< + | | `------------------------|D5 Q5|-----|< + | `--------------------------|D6 Q6|-----|< + `----------------------------|D7 Q7|-----|< + `--------' + + The number of clk-gpios and latched-gpios is not fixed. The actual number + of number of latches and the number of inputs per latch is derived from + the number of GPIOs given in the corresponding device tree properties. + +properties: + compatible: + const: gpio-latch + "#gpio-cells": + const: 2 + + clk-gpios: + description: Array of GPIOs to be used to clock a latch + + latched-gpios: + description: Array of GPIOs to be used as inputs per latch + + setup-duration-ns: + description: Delay in nanoseconds to wait after the latch inputs have been + set up + + clock-duration-ns: + description: Delay in nanoseconds to wait between clock output changes + + gpio-controller: true + + gpio-line-names: true + +required: + - compatible + - "#gpio-cells" + - gpio-controller + - clk-gpios + - latched-gpios + +additionalProperties: false + +examples: + - | + gpio-latch { + #gpio-cells = <2>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_di_do_leds>; + compatible = "gpio-latch"; + gpio-controller; + setup-duration-ns = <100>; + clock-duration-ns = <100>; + + clk-gpios = <&gpio3 7 0>, <&gpio3 8 0>; + latched-gpios = <&gpio3 21 0>, <&gpio3 22 0>, + <&gpio3 23 0>, <&gpio3 24 0>, + <&gpio3 25 0>, <&gpio3 26 0>, + <&gpio3 27 0>, <&gpio3 28 0>; + }; -- GitLab From 1454a928b637bd169d99fc91a46b3b36cea76f9f Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 7 Oct 2022 13:46:46 +0200 Subject: [PATCH 0205/1423] gpio: Add gpio latch driver This driver implements a GPIO multiplexer based on latches connected to other GPIOs. A set of data GPIOs is connected to the data input of multiple latches. The clock input of each latch is driven by another set of GPIOs. With two 8-bit latches 10 GPIOs can be multiplexed into 16 GPIOs. GPOs might be a better term as in fact the multiplexed pins are output only. Signed-off-by: Sascha Hauer Reviewed-by: Serge Semin Reviewed-by: Linus Walleij [Bartosz: fixed the strange of_device_id formatting] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 6 ++ drivers/gpio/Makefile | 1 + drivers/gpio/gpio-latch.c | 219 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+) create mode 100644 drivers/gpio/gpio-latch.c diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 5a04990f03cc8..8c756cb292140 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1684,6 +1684,12 @@ config GPIO_AGGREGATOR industrial control context, to be operated from userspace using the GPIO chardev interface. +config GPIO_LATCH + tristate "GPIO latch driver" + help + Say yes here to enable a driver for GPIO multiplexers based on latches + connected to other GPIOs. + config GPIO_MOCKUP tristate "GPIO Testing Driver" select IRQ_SIM diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 37a0b7ebda434..8629e9eaf79e1 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_GPIO_IT87) += gpio-it87.o obj-$(CONFIG_GPIO_IXP4XX) += gpio-ixp4xx.o obj-$(CONFIG_GPIO_JANZ_TTL) += gpio-janz-ttl.o obj-$(CONFIG_GPIO_KEMPLD) += gpio-kempld.o +obj-$(CONFIG_GPIO_LATCH) += gpio-latch.o obj-$(CONFIG_GPIO_LOGICVC) += gpio-logicvc.o obj-$(CONFIG_GPIO_LOONGSON1) += gpio-loongson1.o obj-$(CONFIG_GPIO_LOONGSON) += gpio-loongson.o diff --git a/drivers/gpio/gpio-latch.c b/drivers/gpio/gpio-latch.c new file mode 100644 index 0000000000000..d7c3b20c8482e --- /dev/null +++ b/drivers/gpio/gpio-latch.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * GPIO latch driver + * + * Copyright (C) 2022 Sascha Hauer + * + * This driver implements a GPIO (or better GPO as there is no input) + * multiplexer based on latches like this: + * + * CLK0 ----------------------. ,--------. + * CLK1 -------------------. `--------|> #0 | + * | | | + * OUT0 ----------------+--|-----------|D0 Q0|-----|< + * OUT1 --------------+-|--|-----------|D1 Q1|-----|< + * OUT2 ------------+-|-|--|-----------|D2 Q2|-----|< + * OUT3 ----------+-|-|-|--|-----------|D3 Q3|-----|< + * OUT4 --------+-|-|-|-|--|-----------|D4 Q4|-----|< + * OUT5 ------+-|-|-|-|-|--|-----------|D5 Q5|-----|< + * OUT6 ----+-|-|-|-|-|-|--|-----------|D6 Q6|-----|< + * OUT7 --+-|-|-|-|-|-|-|--|-----------|D7 Q7|-----|< + * | | | | | | | | | `--------' + * | | | | | | | | | + * | | | | | | | | | ,--------. + * | | | | | | | | `-----------|> #1 | + * | | | | | | | | | | + * | | | | | | | `--------------|D0 Q0|-----|< + * | | | | | | `----------------|D1 Q1|-----|< + * | | | | | `------------------|D2 Q2|-----|< + * | | | | `--------------------|D3 Q3|-----|< + * | | | `----------------------|D4 Q4|-----|< + * | | `------------------------|D5 Q5|-----|< + * | `--------------------------|D6 Q6|-----|< + * `----------------------------|D7 Q7|-----|< + * `--------' + * + * The above is just an example. The actual number of number of latches and + * the number of inputs per latch is derived from the number of GPIOs given + * in the corresponding device tree properties. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gpiolib.h" + +struct gpio_latch_priv { + struct gpio_chip gc; + struct gpio_descs *clk_gpios; + struct gpio_descs *latched_gpios; + int n_latched_gpios; + unsigned int setup_duration_ns; + unsigned int clock_duration_ns; + unsigned long *shadow; + /* + * Depending on whether any of the underlying GPIOs may sleep we either + * use a mutex or a spinlock to protect our shadow map. + */ + union { + struct mutex mutex; /* protects @shadow */ + spinlock_t spinlock; /* protects @shadow */ + }; +}; + +static int gpio_latch_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static void gpio_latch_set_unlocked(struct gpio_latch_priv *priv, + void (*set)(struct gpio_desc *desc, int value), + unsigned int offset, bool val) +{ + int latch = offset / priv->n_latched_gpios; + int i; + + assign_bit(offset, priv->shadow, val); + + for (i = 0; i < priv->n_latched_gpios; i++) + set(priv->latched_gpios->desc[i], + test_bit(latch * priv->n_latched_gpios + i, priv->shadow)); + + ndelay(priv->setup_duration_ns); + set(priv->clk_gpios->desc[latch], 1); + ndelay(priv->clock_duration_ns); + set(priv->clk_gpios->desc[latch], 0); +} + +static void gpio_latch_set(struct gpio_chip *gc, unsigned int offset, int val) +{ + struct gpio_latch_priv *priv = gpiochip_get_data(gc); + unsigned long flags; + + spin_lock_irqsave(&priv->spinlock, flags); + + gpio_latch_set_unlocked(priv, gpiod_set_value, offset, val); + + spin_unlock_irqrestore(&priv->spinlock, flags); +} + +static void gpio_latch_set_can_sleep(struct gpio_chip *gc, unsigned int offset, int val) +{ + struct gpio_latch_priv *priv = gpiochip_get_data(gc); + + mutex_lock(&priv->mutex); + + gpio_latch_set_unlocked(priv, gpiod_set_value_cansleep, offset, val); + + mutex_unlock(&priv->mutex); +} + +static bool gpio_latch_can_sleep(struct gpio_latch_priv *priv, unsigned int n_latches) +{ + int i; + + for (i = 0; i < n_latches; i++) + if (gpiod_cansleep(priv->clk_gpios->desc[i])) + return true; + + for (i = 0; i < priv->n_latched_gpios; i++) + if (gpiod_cansleep(priv->latched_gpios->desc[i])) + return true; + + return false; +} + +/* + * Some value which is still acceptable to delay in atomic context. + * If we need to go higher we might have to switch to usleep_range(), + * but that cannot ne used in atomic context and the driver would have + * to be adjusted to support that. + */ +#define DURATION_NS_MAX 5000 + +static int gpio_latch_probe(struct platform_device *pdev) +{ + struct gpio_latch_priv *priv; + unsigned int n_latches; + struct device_node *np = pdev->dev.of_node; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->clk_gpios = devm_gpiod_get_array(&pdev->dev, "clk", GPIOD_OUT_LOW); + if (IS_ERR(priv->clk_gpios)) + return PTR_ERR(priv->clk_gpios); + + priv->latched_gpios = devm_gpiod_get_array(&pdev->dev, "latched", GPIOD_OUT_LOW); + if (IS_ERR(priv->latched_gpios)) + return PTR_ERR(priv->latched_gpios); + + n_latches = priv->clk_gpios->ndescs; + priv->n_latched_gpios = priv->latched_gpios->ndescs; + + priv->shadow = devm_bitmap_zalloc(&pdev->dev, n_latches * priv->n_latched_gpios, + GFP_KERNEL); + if (!priv->shadow) + return -ENOMEM; + + if (gpio_latch_can_sleep(priv, n_latches)) { + priv->gc.can_sleep = true; + priv->gc.set = gpio_latch_set_can_sleep; + mutex_init(&priv->mutex); + } else { + priv->gc.can_sleep = false; + priv->gc.set = gpio_latch_set; + spin_lock_init(&priv->spinlock); + } + + of_property_read_u32(np, "setup-duration-ns", &priv->setup_duration_ns); + if (priv->setup_duration_ns > DURATION_NS_MAX) { + dev_warn(&pdev->dev, "setup-duration-ns too high, limit to %d\n", + DURATION_NS_MAX); + priv->setup_duration_ns = DURATION_NS_MAX; + } + + of_property_read_u32(np, "clock-duration-ns", &priv->clock_duration_ns); + if (priv->clock_duration_ns > DURATION_NS_MAX) { + dev_warn(&pdev->dev, "clock-duration-ns too high, limit to %d\n", + DURATION_NS_MAX); + priv->clock_duration_ns = DURATION_NS_MAX; + } + + priv->gc.get_direction = gpio_latch_get_direction; + priv->gc.ngpio = n_latches * priv->n_latched_gpios; + priv->gc.owner = THIS_MODULE; + priv->gc.base = -1; + priv->gc.parent = &pdev->dev; + + platform_set_drvdata(pdev, priv); + + return devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv); +} + +static const struct of_device_id gpio_latch_ids[] = { + { + .compatible = "gpio-latch", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, gpio_latch_ids); + +static struct platform_driver gpio_latch_driver = { + .driver = { + .name = "gpio-latch", + .of_match_table = gpio_latch_ids, + }, + .probe = gpio_latch_probe, +}; +module_platform_driver(gpio_latch_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Sascha Hauer "); +MODULE_DESCRIPTION("GPIO latch driver"); -- GitLab From b4e83d369015e3045418ca86984c3cd8dcf5a365 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Oct 2022 20:06:00 +0300 Subject: [PATCH 0206/1423] gpio: exar: Allow IO port access It's possible that PCI device can provide an IO port resource for the device. regmap MMIO currently uses MMIO by default. With an additional flag we enable support for IO port accesses. Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Acked-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-exar.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-exar.c b/drivers/gpio/gpio-exar.c index 482f678c893e5..df1bdaae441c5 100644 --- a/drivers/gpio/gpio-exar.c +++ b/drivers/gpio/gpio-exar.c @@ -141,6 +141,7 @@ static const struct regmap_config exar_regmap_config = { .name = "exar-gpio", .reg_bits = 16, .val_bits = 8, + .io_port = true, }; static int gpio_exar_probe(struct platform_device *pdev) -- GitLab From 94e9f9a23fe4b093cd5a8b292165fad840242b79 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:45 -0500 Subject: [PATCH 0207/1423] agp/efficeon: Convert to generic power management Convert agpgart-efficeon from legacy PCI power management to the generic power management framework. Previously agpgart-efficeon used legacy PCI power management, which means agp_efficeon_suspend() and agp_efficeon_resume() were responsible for both device-specific things and generic PCI things like saving and restoring config space and managing power state. In this case, agp_efficeon_suspend() was empty, and agp_efficeon_resume() already did only device-specific things, so simply convert it to take a struct device * instead of a struct pci_dev *. Based on 0aeddbd0cb07 ("via-agp: convert to generic power management") by Vaibhav Gupta . Link: https://lore.kernel.org/r/20221025203852.681822-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/efficeon-agp.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index c53f0f9ef5b0b..f28d423192695 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -412,18 +412,11 @@ static void agp_efficeon_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#ifdef CONFIG_PM -static int agp_efficeon_suspend(struct pci_dev *dev, pm_message_t state) -{ - return 0; -} - -static int agp_efficeon_resume(struct pci_dev *pdev) +static int agp_efficeon_resume(struct device *dev) { printk(KERN_DEBUG PFX "agp_efficeon_resume()\n"); return efficeon_configure(); } -#endif static const struct pci_device_id agp_efficeon_pci_table[] = { { @@ -437,6 +430,8 @@ static const struct pci_device_id agp_efficeon_pci_table[] = { { } }; +static DEFINE_SIMPLE_DEV_PM_OPS(agp_efficeon_pm_ops, NULL, agp_efficeon_resume); + MODULE_DEVICE_TABLE(pci, agp_efficeon_pci_table); static struct pci_driver agp_efficeon_pci_driver = { @@ -444,10 +439,7 @@ static struct pci_driver agp_efficeon_pci_driver = { .id_table = agp_efficeon_pci_table, .probe = agp_efficeon_probe, .remove = agp_efficeon_remove, -#ifdef CONFIG_PM - .suspend = agp_efficeon_suspend, - .resume = agp_efficeon_resume, -#endif + .driver.pm = &agp_efficeon_pm_ops, }; static int __init agp_efficeon_init(void) -- GitLab From 7f142022e6bfd2dd5ed998f7165e396bf5966513 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:46 -0500 Subject: [PATCH 0208/1423] agp/intel: Convert to generic power management Convert agpgart-intel from legacy PCI power management to the generic power management framework. Previously agpgart-intel used legacy PCI power management, and agp_intel_resume() was responsible for both device-specific things and generic PCI things like saving and restoring config space and managing power state. In this case, agp_intel_suspend() was empty, and agp_intel_resume() already did only device-specific things, so simply convert it to take a struct device * instead of a struct pci_dev *. Based on 0aeddbd0cb07 ("via-agp: convert to generic power management") by Vaibhav Gupta . Link: https://lore.kernel.org/r/20221025203852.681822-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/intel-agp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 9e4f27a6cb5af..c518b3a9db04d 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -817,16 +817,15 @@ static void agp_intel_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#ifdef CONFIG_PM -static int agp_intel_resume(struct pci_dev *pdev) +static int agp_intel_resume(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct agp_bridge_data *bridge = pci_get_drvdata(pdev); bridge->driver->configure(); return 0; } -#endif static const struct pci_device_id agp_intel_pci_table[] = { #define ID(x) \ @@ -895,14 +894,14 @@ static const struct pci_device_id agp_intel_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_intel_pci_table); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_intel_pm_ops, NULL, agp_intel_resume); + static struct pci_driver agp_intel_pci_driver = { .name = "agpgart-intel", .id_table = agp_intel_pci_table, .probe = agp_intel_probe, .remove = agp_intel_remove, -#ifdef CONFIG_PM - .resume = agp_intel_resume, -#endif + .driver.pm = &agp_intel_pm_ops, }; static int __init agp_intel_init(void) -- GitLab From c78679d1fe43f9165b11c5ccd3f79c7108b066fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:47 -0500 Subject: [PATCH 0209/1423] agp/amd-k7: Convert to generic power management Convert agpgart-amdk7 from legacy PCI power management to the generic power management framework. Previously agpgart-amdk7 used legacy PCI power management, and agp_amdk7_suspend() and agp_amdk7_resume() were responsible for both device-specific things and generic PCI things like saving and restoring config space and managing power state: agp_amdk7_suspend pci_save_state <-- generic PCI pci_set_power_state <-- generic PCI agp_amdk7_resume pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI amd_irongate_driver.configure <-- device-specific Convert to generic power management where the PCI bus PM methods do the generic PCI things, and the driver needs only the device-specific part, i.e., suspend_devices_and_enter dpm_suspend_start(PMSG_SUSPEND) pci_pm_suspend # PCI bus .suspend() method agp_amdk7_suspend <-- not needed at all; removed suspend_enter dpm_suspend_noirq(PMSG_SUSPEND) pci_pm_suspend_noirq # PCI bus .suspend_noirq() method pci_save_state <-- generic PCI pci_prepare_to_sleep <-- generic PCI pci_set_power_state ... dpm_resume_end(PMSG_RESUME) pci_pm_resume # PCI bus .resume() method pci_restore_standard_config pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI agp_amdk7_resume # driver->pm->resume amd_irongate_driver.configure <-- device-specific Based on 0aeddbd0cb07 ("via-agp: convert to generic power management") by Vaibhav Gupta . Link: https://lore.kernel.org/r/20221025203852.681822-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/amd-k7-agp.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 2b2095542816b..55397ba765d23 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -488,26 +488,11 @@ static void agp_amdk7_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#ifdef CONFIG_PM - -static int agp_amdk7_suspend(struct pci_dev *pdev, pm_message_t state) +static int agp_amdk7_resume(struct device *dev) { - pci_save_state(pdev); - pci_set_power_state(pdev, pci_choose_state(pdev, state)); - - return 0; -} - -static int agp_amdk7_resume(struct pci_dev *pdev) -{ - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - return amd_irongate_driver.configure(); } -#endif /* CONFIG_PM */ - /* must be the same order as name table above */ static const struct pci_device_id agp_amdk7_pci_table[] = { { @@ -539,15 +524,14 @@ static const struct pci_device_id agp_amdk7_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_amdk7_pci_table); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_amdk7_pm_ops, NULL, agp_amdk7_resume); + static struct pci_driver agp_amdk7_pci_driver = { .name = "agpgart-amdk7", .id_table = agp_amdk7_pci_table, .probe = agp_amdk7_probe, .remove = agp_amdk7_remove, -#ifdef CONFIG_PM - .suspend = agp_amdk7_suspend, - .resume = agp_amdk7_resume, -#endif + .driver.pm = &agp_amdk7_pm_ops, }; static int __init agp_amdk7_init(void) -- GitLab From 6a1274ea0e5dfb2eca85b0175820d7b5183c9cae Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:48 -0500 Subject: [PATCH 0210/1423] agp/ati: Convert to generic power management Convert agpgart-ati from legacy PCI power management to the generic power management framework. Previously agpgart-ati used legacy PCI power management, and agp_ati_suspend() and agp_ati_resume() were responsible for both device-specific things and generic PCI things like saving and restoring config space and managing power state: agp_ati_suspend pci_save_state <-- generic PCI pci_set_power_state(PCI_D3hot) <-- generic PCI agp_ati_resume pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI ati_configure <-- device-specific With generic power management, the PCI bus PM methods do the generic PCI things, and the driver needs only the device-specific part, i.e., suspend_devices_and_enter dpm_suspend_start(PMSG_SUSPEND) pci_pm_suspend # PCI bus .suspend() method agp_ati_suspend <-- not needed at all; removed suspend_enter dpm_suspend_noirq(PMSG_SUSPEND) pci_pm_suspend_noirq # PCI bus .suspend_noirq() method pci_save_state <-- generic PCI pci_prepare_to_sleep <-- generic PCI pci_set_power_state ... dpm_resume_end(PMSG_RESUME) pci_pm_resume # PCI bus .resume() method pci_restore_standard_config pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI agp_ati_resume # driver->pm->resume ati_configure <-- device-specific Based on 0aeddbd0cb07 ("via-agp: convert to generic power management") by Vaibhav Gupta . Link: https://lore.kernel.org/r/20221025203852.681822-5-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/ati-agp.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 6f5530482d833..3c1fce48aabe7 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -238,23 +238,10 @@ static int ati_configure(void) } -#ifdef CONFIG_PM -static int agp_ati_suspend(struct pci_dev *dev, pm_message_t state) +static int agp_ati_resume(struct device *dev) { - pci_save_state(dev); - pci_set_power_state(dev, PCI_D3hot); - - return 0; -} - -static int agp_ati_resume(struct pci_dev *dev) -{ - pci_set_power_state(dev, PCI_D0); - pci_restore_state(dev); - return ati_configure(); } -#endif /* *Since we don't need contiguous memory we just try @@ -559,15 +546,14 @@ static const struct pci_device_id agp_ati_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_ati_pci_table); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_ati_pm_ops, NULL, agp_ati_resume); + static struct pci_driver agp_ati_pci_driver = { .name = "agpgart-ati", .id_table = agp_ati_pci_table, .probe = agp_ati_probe, .remove = agp_ati_remove, -#ifdef CONFIG_PM - .suspend = agp_ati_suspend, - .resume = agp_ati_resume, -#endif + .driver.pm = &agp_ati_pm_ops, }; static int __init agp_ati_init(void) -- GitLab From 11a8d8774e68e07385a5b10d9546598f57ace7da Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:49 -0500 Subject: [PATCH 0211/1423] agp/nvidia: Convert to generic power management Convert agpgart-nvidia from legacy PCI power management to the generic power management framework. Previously agpgart-nvidia used legacy PCI power management, and agp_nvidia_suspend() and agp_nvidia_resume() were responsible for both device-specific things and generic PCI things: agp_nvidia_suspend pci_save_state <-- generic PCI pci_set_power_state(PCI_D3hot) <-- generic PCI agp_nvidia_resume pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI nvidia_configure <-- device-specific Convert to generic power management where the PCI bus PM methods do the generic PCI things, and the driver needs only the device-specific part, i.e., suspend_devices_and_enter dpm_suspend_start(PMSG_SUSPEND) pci_pm_suspend # PCI bus .suspend() method agp_nvidia_suspend <-- not needed at all; removed suspend_enter dpm_suspend_noirq(PMSG_SUSPEND) pci_pm_suspend_noirq # PCI bus .suspend_noirq() method pci_save_state <-- generic PCI pci_prepare_to_sleep <-- generic PCI pci_set_power_state ... dpm_resume_end(PMSG_RESUME) pci_pm_resume # PCI bus .resume() method pci_restore_standard_config pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI agp_nvidia_resume # driver->pm->resume nvidia_configure <-- device-specific Based on 0aeddbd0cb07 ("via-agp: convert to generic power management") by Vaibhav Gupta . Link: https://lore.kernel.org/r/20221025203852.681822-6-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/nvidia-agp.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index 826dbd06f6bbb..dbcbc06cc2029 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -404,28 +404,13 @@ static void agp_nvidia_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#ifdef CONFIG_PM -static int agp_nvidia_suspend(struct pci_dev *pdev, pm_message_t state) +static int agp_nvidia_resume(struct device *dev) { - pci_save_state(pdev); - pci_set_power_state(pdev, PCI_D3hot); - - return 0; -} - -static int agp_nvidia_resume(struct pci_dev *pdev) -{ - /* set power state 0 and restore PCI space */ - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - /* reconfigure AGP hardware again */ nvidia_configure(); return 0; } -#endif - static const struct pci_device_id agp_nvidia_pci_table[] = { { @@ -449,15 +434,14 @@ static const struct pci_device_id agp_nvidia_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_nvidia_pci_table); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_nvidia_pm_ops, NULL, agp_nvidia_resume); + static struct pci_driver agp_nvidia_pci_driver = { .name = "agpgart-nvidia", .id_table = agp_nvidia_pci_table, .probe = agp_nvidia_probe, .remove = agp_nvidia_remove, -#ifdef CONFIG_PM - .suspend = agp_nvidia_suspend, - .resume = agp_nvidia_resume, -#endif + .driver.pm = &agp_nvidia_pm_ops, }; static int __init agp_nvidia_init(void) -- GitLab From 8c1f82c710f18c5f51c3b43402cd8175d6655369 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:50 -0500 Subject: [PATCH 0212/1423] agp/amd64: Update to DEFINE_SIMPLE_DEV_PM_OPS() As of 1a3c7bb08826 ("PM: core: Add new *_PM_OPS macros, deprecate old ones"), SIMPLE_DEV_PM_OPS() is deprecated in favor of DEFINE_SIMPLE_DEV_PM_OPS(), which has the advantage that the PM callbacks don't need to be wrapped with #ifdef CONFIG_PM or tagged with __maybe_unused. Convert to DEFINE_SIMPLE_DEV_PM_OPS(). No functional change intended. Link: https://lore.kernel.org/r/20221025203852.681822-7-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/amd64-agp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 84a4aa9312cf8..ce8651436609f 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -588,9 +588,7 @@ static void agp_amd64_remove(struct pci_dev *pdev) agp_bridges_found--; } -#define agp_amd64_suspend NULL - -static int __maybe_unused agp_amd64_resume(struct device *dev) +static int agp_amd64_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -727,7 +725,7 @@ static const struct pci_device_id agp_amd64_pci_promisc_table[] = { { } }; -static SIMPLE_DEV_PM_OPS(agp_amd64_pm_ops, agp_amd64_suspend, agp_amd64_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_amd64_pm_ops, NULL, agp_amd64_resume); static struct pci_driver agp_amd64_pci_driver = { .name = "agpgart-amd64", -- GitLab From 746e926b9fe327ace4be187144913abd7cfc2f4a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:51 -0500 Subject: [PATCH 0213/1423] agp/sis: Update to DEFINE_SIMPLE_DEV_PM_OPS() As of 1a3c7bb08826 ("PM: core: Add new *_PM_OPS macros, deprecate old ones"), SIMPLE_DEV_PM_OPS() is deprecated in favor of DEFINE_SIMPLE_DEV_PM_OPS(), which has the advantage that the PM callbacks don't need to be wrapped with #ifdef CONFIG_PM or tagged with __maybe_unused. Convert to DEFINE_SIMPLE_DEV_PM_OPS(). No functional change intended. Link: https://lore.kernel.org/r/20221025203852.681822-8-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/sis-agp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/char/agp/sis-agp.c b/drivers/char/agp/sis-agp.c index f8a02f4bef1bf..484bb101c53b5 100644 --- a/drivers/char/agp/sis-agp.c +++ b/drivers/char/agp/sis-agp.c @@ -217,10 +217,7 @@ static void agp_sis_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#define agp_sis_suspend NULL - -static int __maybe_unused agp_sis_resume( - __attribute__((unused)) struct device *dev) +static int agp_sis_resume(__attribute__((unused)) struct device *dev) { return sis_driver.configure(); } @@ -407,7 +404,7 @@ static const struct pci_device_id agp_sis_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_sis_pci_table); -static SIMPLE_DEV_PM_OPS(agp_sis_pm_ops, agp_sis_suspend, agp_sis_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_sis_pm_ops, NULL, agp_sis_resume); static struct pci_driver agp_sis_pci_driver = { .name = "agpgart-sis", -- GitLab From 73fcd4520edb430684246448d096f8f17f107c97 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 15:38:52 -0500 Subject: [PATCH 0214/1423] agp/via: Update to DEFINE_SIMPLE_DEV_PM_OPS() As of 1a3c7bb08826 ("PM: core: Add new *_PM_OPS macros, deprecate old ones"), SIMPLE_DEV_PM_OPS() is deprecated in favor of DEFINE_SIMPLE_DEV_PM_OPS(), which has the advantage that the PM callbacks don't need to be wrapped with #ifdef CONFIG_PM or tagged with __maybe_unused. Convert to DEFINE_SIMPLE_DEV_PM_OPS(). No functional change intended. Link: https://lore.kernel.org/r/20221025203852.681822-9-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Dave Airlie --- drivers/char/agp/via-agp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index b2f484f527fb7..bc5140af2dcbe 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -489,9 +489,7 @@ static void agp_via_remove(struct pci_dev *pdev) agp_put_bridge(bridge); } -#define agp_via_suspend NULL - -static int __maybe_unused agp_via_resume(struct device *dev) +static int agp_via_resume(struct device *dev) { struct agp_bridge_data *bridge = dev_get_drvdata(dev); @@ -551,7 +549,7 @@ static const struct pci_device_id agp_via_pci_table[] = { MODULE_DEVICE_TABLE(pci, agp_via_pci_table); -static SIMPLE_DEV_PM_OPS(agp_via_pm_ops, agp_via_suspend, agp_via_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(agp_via_pm_ops, NULL, agp_via_resume); static struct pci_driver agp_via_pci_driver = { .name = "agpgart-via", -- GitLab From 5984de0b41bf8f261e41b45c4fe64a32236e0c42 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 25 Oct 2022 14:35:02 -0500 Subject: [PATCH 0215/1423] PCI/PM: Remove unused 'state' parameter to pci_legacy_suspend_late() 1a1daf097e21 ("PCI/PM: Remove unused pci_driver.suspend_late() hook") removed the legacy .suspend_late() hook, which was the only user of the "state" parameter to pci_legacy_suspend_late(), but it neglected to remove the parameter. Remove the unused "state" parameter to pci_legacy_suspend_late(). Link: https://lore.kernel.org/r/20221025193502.669091-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- drivers/pci/pci-driver.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 107d77f3c8467..a2ceeacc33eb6 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -646,7 +646,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) return 0; } -static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) +static int pci_legacy_suspend_late(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); @@ -848,7 +848,7 @@ static int pci_pm_suspend_noirq(struct device *dev) return 0; if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend_late(dev, PMSG_SUSPEND); + return pci_legacy_suspend_late(dev); if (!pm) { pci_save_state(pci_dev); @@ -1060,7 +1060,7 @@ static int pci_pm_freeze_noirq(struct device *dev) const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend_late(dev, PMSG_FREEZE); + return pci_legacy_suspend_late(dev); if (pm && pm->freeze_noirq) { int error; @@ -1179,7 +1179,7 @@ static int pci_pm_poweroff_noirq(struct device *dev) return 0; if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend_late(dev, PMSG_HIBERNATE); + return pci_legacy_suspend_late(dev); if (!pm) { pci_fixup_device(pci_fixup_suspend_late, pci_dev); -- GitLab From f7ec74c14f24d78f054efd4ddbda0b4a174cf39f Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Wed, 26 Oct 2022 20:45:41 +0530 Subject: [PATCH 0216/1423] dt-bindings: gpio: pca9570: Add compatible for slg7xl45106 This patch adds compatible string for the SLG7XL45106, I2C GPO expander. Acked-by: Krzysztof Kozlowski Signed-off-by: Shubhrajyoti Datta Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml b/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml index 1acaa0a3d35af..48bf414aa50ea 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml @@ -12,6 +12,7 @@ maintainers: properties: compatible: enum: + - dlg,slg7xl45106 - nxp,pca9570 - nxp,pca9571 -- GitLab From b8a34582c7f7f22f82852f9d3cc192e050f892fd Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Wed, 26 Oct 2022 20:45:42 +0530 Subject: [PATCH 0217/1423] gpio: pca9570: add a platform data structure Add struct pca9570_platform_data for adding the platform data structure. Also modify the existing structs for pca9570 and pca9571 Signed-off-by: Shubhrajyoti Datta Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca9570.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-pca9570.c b/drivers/gpio/gpio-pca9570.c index ab2a652964ec2..e8c2ddb1bcd8c 100644 --- a/drivers/gpio/gpio-pca9570.c +++ b/drivers/gpio/gpio-pca9570.c @@ -15,14 +15,26 @@ #include #include +/** + * struct pca9570_platform_data - GPIO platformdata + * @ngpio: no of gpios + * @command: Command to be sent + */ +struct pca9570_platform_data { + u16 ngpio; + u32 command; +}; + /** * struct pca9570 - GPIO driver data * @chip: GPIO controller chip + * @p_data: GPIO controller platform data * @lock: Protects write sequences * @out: Buffer for device register */ struct pca9570 { struct gpio_chip chip; + const struct pca9570_platform_data *p_data; struct mutex lock; u8 out; }; @@ -106,7 +118,8 @@ static int pca9570_probe(struct i2c_client *client) gpio->chip.get = pca9570_get; gpio->chip.set = pca9570_set; gpio->chip.base = -1; - gpio->chip.ngpio = (uintptr_t)device_get_match_data(&client->dev); + gpio->p_data = device_get_match_data(&client->dev); + gpio->chip.ngpio = gpio->p_data->ngpio; gpio->chip.can_sleep = true; mutex_init(&gpio->lock); @@ -119,16 +132,24 @@ static int pca9570_probe(struct i2c_client *client) return devm_gpiochip_add_data(&client->dev, &gpio->chip, gpio); } +static const struct pca9570_platform_data pca9570_gpio = { + .ngpio = 4, +}; + +static const struct pca9570_platform_data pca9571_gpio = { + .ngpio = 8, +}; + static const struct i2c_device_id pca9570_id_table[] = { - { "pca9570", 4 }, - { "pca9571", 8 }, + { "pca9570", (kernel_ulong_t)&pca9570_gpio}, + { "pca9571", (kernel_ulong_t)&pca9571_gpio }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(i2c, pca9570_id_table); static const struct of_device_id pca9570_of_match_table[] = { - { .compatible = "nxp,pca9570", .data = (void *)4 }, - { .compatible = "nxp,pca9571", .data = (void *)8 }, + { .compatible = "nxp,pca9570", .data = &pca9570_gpio }, + { .compatible = "nxp,pca9571", .data = &pca9571_gpio }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, pca9570_of_match_table); -- GitLab From fbb19fe17eaef7b6ba2e68dbf0600a97060f2909 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Wed, 26 Oct 2022 20:45:43 +0530 Subject: [PATCH 0218/1423] gpio: pca9570: add slg7xl45106 support Dialog semiconductors SLG7XL45106 is an 8-bit I2C GPO expander. The output port is controlled by a data byte with register address. Add a compatible string for the same. Also update the driver to write and read from it. Reviewed-by: Linus Walleij Signed-off-by: Shubhrajyoti Datta Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca9570.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pca9570.c b/drivers/gpio/gpio-pca9570.c index e8c2ddb1bcd8c..6c07a8811a7a5 100644 --- a/drivers/gpio/gpio-pca9570.c +++ b/drivers/gpio/gpio-pca9570.c @@ -15,6 +15,8 @@ #include #include +#define SLG7XL45106_GPO_REG 0xDB + /** * struct pca9570_platform_data - GPIO platformdata * @ngpio: no of gpios @@ -44,7 +46,11 @@ static int pca9570_read(struct pca9570 *gpio, u8 *value) struct i2c_client *client = to_i2c_client(gpio->chip.parent); int ret; - ret = i2c_smbus_read_byte(client); + if (gpio->p_data->command != 0) + ret = i2c_smbus_read_byte_data(client, gpio->p_data->command); + else + ret = i2c_smbus_read_byte(client); + if (ret < 0) return ret; @@ -56,6 +62,9 @@ static int pca9570_write(struct pca9570 *gpio, u8 value) { struct i2c_client *client = to_i2c_client(gpio->chip.parent); + if (gpio->p_data->command != 0) + return i2c_smbus_write_byte_data(client, gpio->p_data->command, value); + return i2c_smbus_write_byte(client, value); } @@ -140,14 +149,21 @@ static const struct pca9570_platform_data pca9571_gpio = { .ngpio = 8, }; +static const struct pca9570_platform_data slg7xl45106_gpio = { + .ngpio = 8, + .command = SLG7XL45106_GPO_REG, +}; + static const struct i2c_device_id pca9570_id_table[] = { { "pca9570", (kernel_ulong_t)&pca9570_gpio}, { "pca9571", (kernel_ulong_t)&pca9571_gpio }, + { "slg7xl45106", (kernel_ulong_t)&slg7xl45106_gpio }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(i2c, pca9570_id_table); static const struct of_device_id pca9570_of_match_table[] = { + { .compatible = "dlg,slg7xl45106", .data = &slg7xl45106_gpio}, { .compatible = "nxp,pca9570", .data = &pca9570_gpio }, { .compatible = "nxp,pca9571", .data = &pca9571_gpio }, { /* sentinel */ } -- GitLab From ba4ff1cb6cac8acca928ea41588cf84b18ffdedb Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Tue, 25 Oct 2022 01:19:08 -0700 Subject: [PATCH 0219/1423] dt-bindings: PCI: ti,j721e-pci-host: add interrupt controller definition Add missing 'interrupt-controller' property and related subnodes to resolve the following warning: arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dtb: pcie@2910000: Unevaluated properties are not allowed ('interrupt-controller' was unexpected) From schema: Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml Link: https://lore.kernel.org/r/20221025081909.404107-2-mranostay@ti.com Signed-off-by: Matt Ranostay Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- .../devicetree/bindings/pci/ti,j721e-pci-host.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml index 2115d5a3f0e14..0f5914a22c14c 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml @@ -76,6 +76,19 @@ properties: msi-map: true + interrupt-controller: + type: object + additionalProperties: false + + properties: + interrupt-controller: true + + '#interrupt-cells': + const: 1 + + interrupts: + maxItems: 1 + required: - compatible - reg -- GitLab From 598418e6035622c0dc735764f0f1b7293c0c7d48 Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Tue, 25 Oct 2022 01:19:09 -0700 Subject: [PATCH 0220/1423] dt-bindings: PCI: ti,j721e-pci-*: Add missing interrupt properties Both interrupts, and interrupt names weren't defined in both EP and host yaml. Also define the only possible interrupt-name as link_state, and maxItems of interrupts to one. This patch resolves the following warning: arch/arm64/boot/dts/ti/k3-j721s2-common-proc-board.dtb: pcie-ep@2910000: Unevaluated properties are not allowed ('interrupt-names', 'interrupts' were unexpected) From schema Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml Link: https://lore.kernel.org/r/20221025081909.404107-3-mranostay@ti.com Signed-off-by: Matt Ranostay Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml | 7 +++++++ .../devicetree/bindings/pci/ti,j721e-pci-host.yaml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml index aed437dac363a..10e6eabdff534 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml @@ -58,6 +58,13 @@ properties: dma-coherent: description: Indicates that the PCIe IP block can ensure the coherency + interrupts: + maxItems: 1 + + interrupt-names: + items: + - const: link_state + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml index 0f5914a22c14c..d9df7cd922f1f 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml @@ -76,6 +76,13 @@ properties: msi-map: true + interrupts: + maxItems: 1 + + interrupt-names: + items: + - const: link_state + interrupt-controller: type: object additionalProperties: false -- GitLab From 66110361281b2f7da0c8bd51eaf1f152f4236035 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Mon, 26 Sep 2022 16:49:23 +0530 Subject: [PATCH 0221/1423] PCI: dwc: Fix n_fts[] array overrun commit aeaa0bfe89654 ("PCI: dwc: Move N_FTS setup to common setup") incorrectly uses pci->link_gen in deriving the index to the n_fts[] array also introducing the issue of accessing beyond the boundaries of array for greater than Gen-2 speeds. This change fixes that issue. Link: https://lore.kernel.org/r/20220926111923.22487-1-vidyas@nvidia.com Fixes: aeaa0bfe8965 ("PCI: dwc: Move N_FTS setup to common setup") Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Jingoo Han --- drivers/pci/controller/dwc/pcie-designware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index c6725c519a479..9e4d96e5a3f5a 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -641,7 +641,7 @@ void dw_pcie_setup(struct dw_pcie *pci) if (pci->n_fts[1]) { val = dw_pcie_readl_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL); val &= ~PORT_LOGIC_N_FTS_MASK; - val |= pci->n_fts[pci->link_gen - 1]; + val |= pci->n_fts[1]; dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); } -- GitLab From 4508d32ccced24c972bc4592104513e1ff8439b5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 25 Oct 2022 10:37:13 +0300 Subject: [PATCH 0222/1423] RDMA/core: Fix order of nldev_exit call Create symmetrical exit flow by calling to nldev_exit() after call to rdma_nl_unregister(RDMA_NL_LS). Fixes: 6c80b41abe22 ("RDMA/netlink: Add nldev initialization flows") Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/64e676774a53a406f4cde265d5a4cfd6b8e97df9.1666683334.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ae60c73babcc5..3409c55ea88bf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2843,8 +2843,8 @@ err: static void __exit ib_core_cleanup(void) { roce_gid_mgmt_cleanup(); - nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); + nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); -- GitLab From e32e1e26c4098d8a866ce09fd26d8004da4ddf9e Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Mon, 19 Sep 2022 20:03:39 +0530 Subject: [PATCH 0223/1423] PCI: Add PCI_PTM_CAP_RES macro Add macro defining Responder capable bit in Precision Time Measurement capability register. Link: https://lore.kernel.org/r/20220919143340.4527-2-vidyas@nvidia.com Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Jingoo Han --- include/uapi/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 57b8e2ffb1dd3..1c3591c8e09ef 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1058,6 +1058,7 @@ /* Precision Time Measurement */ #define PCI_PTM_CAP 0x04 /* PTM Capability */ #define PCI_PTM_CAP_REQ 0x00000001 /* Requester capable */ +#define PCI_PTM_CAP_RES 0x00000002 /* Responder capable */ #define PCI_PTM_CAP_ROOT 0x00000004 /* Root capable */ #define PCI_PTM_GRANULARITY_MASK 0x0000FF00 /* Clock granularity */ #define PCI_PTM_CTRL 0x08 /* PTM Control */ -- GitLab From 442ae919e6ca77354551a7b8717746b44272e274 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Mon, 19 Sep 2022 20:03:40 +0530 Subject: [PATCH 0224/1423] PCI: designware-ep: Disable PTM capabilities for EP mode Dual mode DesignWare PCIe IP has PTM capability enabled (if supported) even in the EP mode. The PCIe compliance for the EP mode expects PTM capabilities (ROOT_CAPABLE, RES_CAPABLE, CLK_GRAN) be disabled. Hence disable PTM for the EP mode. Link: https://lore.kernel.org/r/20220919143340.4527-3-vidyas@nvidia.com Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Jingoo Han --- .../pci/controller/dwc/pcie-designware-ep.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 83ddb190292e4..efc6c6360e28f 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -643,7 +643,7 @@ static unsigned int dw_pcie_ep_find_ext_capability(struct dw_pcie *pci, int cap) int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep) { struct dw_pcie *pci = to_dw_pcie_from_ep(ep); - unsigned int offset; + unsigned int offset, ptm_cap_base; unsigned int nbars; u8 hdr_type; u32 reg; @@ -659,6 +659,7 @@ int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep) } offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR); + ptm_cap_base = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_PTM); dw_pcie_dbi_ro_wr_en(pci); @@ -671,6 +672,22 @@ int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep) dw_pcie_writel_dbi(pci, offset + PCI_REBAR_CAP, 0x0); } + /* + * PTM responder capability can be disabled only after disabling + * PTM root capability. + */ + if (ptm_cap_base) { + dw_pcie_dbi_ro_wr_en(pci); + reg = dw_pcie_readl_dbi(pci, ptm_cap_base + PCI_PTM_CAP); + reg &= ~PCI_PTM_CAP_ROOT; + dw_pcie_writel_dbi(pci, ptm_cap_base + PCI_PTM_CAP, reg); + + reg = dw_pcie_readl_dbi(pci, ptm_cap_base + PCI_PTM_CAP); + reg &= ~(PCI_PTM_CAP_RES | PCI_PTM_GRANULARITY_MASK); + dw_pcie_writel_dbi(pci, ptm_cap_base + PCI_PTM_CAP, reg); + dw_pcie_dbi_ro_wr_dis(pci); + } + dw_pcie_setup(pci); dw_pcie_dbi_ro_wr_dis(pci); -- GitLab From 7711cbb4862aa00909a248f011ba3fa578bd1cf3 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Thu, 23 Jun 2022 09:38:17 +0900 Subject: [PATCH 0225/1423] PCI: endpoint: Fix WARN() when an endpoint driver is removed Since there is no release callback defined for the PCI EPC device, the below warning is thrown by driver core when a PCI endpoint driver is removed: Device 'e65d0000.pcie-ep' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst. WARNING: CPU: 0 PID: 139 at drivers/base/core.c:2232 device_release+0x78/0x8c Hence, add the release callback and also move the kfree(epc) from pci_epc_destroy() so that the epc memory is freed when all references are dropped. Link: https://lore.kernel.org/r/20220623003817.298173-1-yoshihiro.shimoda.uh@renesas.com Tested-by: Vidya Sagar Signed-off-by: Yoshihiro Shimoda Signed-off-by: Lorenzo Pieralisi Reviewed-by: Manivannan Sadhasivam --- drivers/pci/endpoint/pci-epc-core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index 3bc9273d0a082..2542196e8c3d9 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -724,7 +724,6 @@ void pci_epc_destroy(struct pci_epc *epc) { pci_ep_cfs_remove_epc_group(epc->group); device_unregister(&epc->dev); - kfree(epc); } EXPORT_SYMBOL_GPL(pci_epc_destroy); @@ -746,6 +745,11 @@ void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc) } EXPORT_SYMBOL_GPL(devm_pci_epc_destroy); +static void pci_epc_release(struct device *dev) +{ + kfree(to_pci_epc(dev)); +} + /** * __pci_epc_create() - create a new endpoint controller (EPC) device * @dev: device that is creating the new EPC @@ -779,6 +783,7 @@ __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, device_initialize(&epc->dev); epc->dev.class = pci_epc_class; epc->dev.parent = dev; + epc->dev.release = pci_epc_release; epc->ops = ops; ret = dev_set_name(&epc->dev, "%s", dev_name(dev)); -- GitLab From 16e3f40779659ff525364e5d9df369953fa7192b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 4 Sep 2022 23:30:53 -0700 Subject: [PATCH 0226/1423] PCI: tegra: Switch to using devm_fwnode_gpiod_get [devm_]gpiod_get_from_of_node in drivers usage should be limited so that gpiolib can be cleaned up; let's switch to the generic device property API. It may even help with handling secondary fwnodes when gpiolib is taught to handle gpios described by swnodes. Link: https://lore.kernel.org/r/20220903-gpiod_get_from_of_node-remove-v1-1-b29adfb27a6c@gmail.com Signed-off-by: Dmitry Torokhov [lpieralisi@kernel.org: commit log] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Linus Walleij --- drivers/pci/controller/pci-tegra.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 24478ae5a345d..b6f77b102709d 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2197,10 +2197,11 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) * and in this case fall back to using AFI per port register * to toggle PERST# SFIO line. */ - rp->reset_gpio = devm_gpiod_get_from_of_node(dev, port, - "reset-gpios", 0, - GPIOD_OUT_LOW, - label); + rp->reset_gpio = devm_fwnode_gpiod_get(dev, + of_fwnode_handle(port), + "reset", + GPIOD_OUT_LOW, + label); if (IS_ERR(rp->reset_gpio)) { if (PTR_ERR(rp->reset_gpio) == -ENOENT) { rp->reset_gpio = NULL; -- GitLab From 6acd25cc98ce0c9ee4fefdaf44fc8bca534b26e5 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 25 Aug 2022 18:01:01 +0900 Subject: [PATCH 0227/1423] PCI: pci-epf-test: Register notifier if only core_init_notifier is enabled The pci_epf_test_notifier function should be installed also if only core_init_notifier is enabled. Fix the current logic. Link: https://lore.kernel.org/r/20220825090101.20474-1-hayashi.kunihiko@socionext.com Fixes: 5e50ee27d4a5 ("PCI: pci-epf-test: Add support to defer core initialization") Signed-off-by: Kunihiko Hayashi Signed-off-by: Lorenzo Pieralisi Acked-by: Om Prakash Singh Acked-by: Kishon Vijay Abraham I --- drivers/pci/endpoint/functions/pci-epf-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 36b1801a061b7..55283d2379a6a 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -979,7 +979,7 @@ static int pci_epf_test_bind(struct pci_epf *epf) if (ret) epf_test->dma_supported = false; - if (linkup_notifier) { + if (linkup_notifier || core_init_notifier) { epf->nb.notifier_call = pci_epf_test_notifier; pci_epc_register_notifier(epc, &epf->nb); } else { -- GitLab From 5e9c68ea777594a2d63fa44c0509782e90821707 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 12 Oct 2022 01:18:40 +0200 Subject: [PATCH 0228/1423] RISC-V: Cache SBI vendor values sbi_get_mvendorid(), sbi_get_marchid() and sbi_get_mimpid() might get called multiple times, though the values of these CSRs should not change during the runtime of a specific machine. Though the values can be different depending on which hart of the system they get called. So hook into the newly introduced cpuinfo struct to allow retrieving these cached values via new functions. Also use arch_initcall for the cpuinfo setup instead, as that now clearly is "architecture specific initialization" and also makes these information available slightly earlier. [caching vendor ids] Suggested-by: Atish Patra [using cpuinfo struct as cache] Suggested-by: Anup Patel Link: https://lore.kernel.org/all/20221011231841.2951264-2-heiko@sntech.de/ Signed-off-by: Heiko Stuebner Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 5 +++++ arch/riscv/kernel/cpu.c | 30 +++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 2a0ef738695ed..4ca7fbacff424 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -327,4 +327,9 @@ int sbi_err_map_linux_errno(int err); static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; } static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ + +unsigned long riscv_cached_mvendorid(unsigned int cpu_id); +unsigned long riscv_cached_marchid(unsigned int cpu_id); +unsigned long riscv_cached_mimpid(unsigned int cpu_id); + #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index fa427bdcf773d..bf9dd6764bad2 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -70,8 +70,6 @@ int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid) return -1; } -#ifdef CONFIG_PROC_FS - struct riscv_cpuinfo { unsigned long mvendorid; unsigned long marchid; @@ -79,6 +77,30 @@ struct riscv_cpuinfo { }; static DEFINE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); +unsigned long riscv_cached_mvendorid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->mvendorid; +} +EXPORT_SYMBOL(riscv_cached_mvendorid); + +unsigned long riscv_cached_marchid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->marchid; +} +EXPORT_SYMBOL(riscv_cached_marchid); + +unsigned long riscv_cached_mimpid(unsigned int cpu_id) +{ + struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); + + return ci->mimpid; +} +EXPORT_SYMBOL(riscv_cached_mimpid); + static int riscv_cpuinfo_starting(unsigned int cpu) { struct riscv_cpuinfo *ci = this_cpu_ptr(&riscv_cpuinfo); @@ -113,7 +135,9 @@ static int __init riscv_cpuinfo_init(void) return 0; } -device_initcall(riscv_cpuinfo_init); +arch_initcall(riscv_cpuinfo_init); + +#ifdef CONFIG_PROC_FS #define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ { \ -- GitLab From 65e9fb081877a18c432c6ff344937b7277c044b5 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 12 Oct 2022 01:18:41 +0200 Subject: [PATCH 0229/1423] drivers/perf: riscv_pmu_sbi: add support for PMU variant on T-Head C9xx cores With the T-HEAD C9XX cores being designed before or during the ratification to the SSCOFPMF extension, it implements functionality very similar but not equal to it. It implements overflow handling and also some privilege-mode filtering. While SSCOFPMF supports this for all modes, the C9XX only implements the filtering for M-mode and S-mode but not user-mode. So add some adaptions to allow the C9XX to still handle its PMU through the regular SBI PMU interface instead of defining new interfaces or drivers. To work properly, this requires a matching change in SBI, though the actual interface between kernel and SBI does not change. The main differences are a the overflow CSR and irq number. As the reading of the overflow-csr is in the hot-path during irq handling, use an errata and alternatives to not introduce new conditionals there. Reviewed-by: Andrew Jones Reviewed-by: Conor Dooley Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/all/20221011231841.2951264-2-heiko@sntech.de/ Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 13 +++++++++++ arch/riscv/errata/thead/errata.c | 19 ++++++++++++++++ arch/riscv/include/asm/errata_list.h | 16 ++++++++++++- drivers/perf/riscv_pmu_sbi.c | 34 ++++++++++++++++++++-------- 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index f3623df23b5fc..69621ae6d647a 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -66,4 +66,17 @@ config ERRATA_THEAD_CMO If you don't know what to do here, say "Y". +config ERRATA_THEAD_PMU + bool "Apply T-Head PMU errata" + depends on ERRATA_THEAD && RISCV_PMU_SBI + default y + help + The T-Head C9xx cores implement a PMU overflow extension very + similar to the core SSCOFPMF extension. + + This will apply the overflow errata to handle the non-standard + behaviour via the regular SBI PMU driver and interface. + + If you don't know what to do here, say "Y". + endmenu # "CPU errata selection" diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 21546937db39b..fac5742d1c1e6 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -47,6 +47,22 @@ static bool errata_probe_cmo(unsigned int stage, return true; } +static bool errata_probe_pmu(unsigned int stage, + unsigned long arch_id, unsigned long impid) +{ + if (!IS_ENABLED(CONFIG_ERRATA_THEAD_PMU)) + return false; + + /* target-c9xx cores report arch_id and impid as 0 */ + if (arch_id != 0 || impid != 0) + return false; + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return false; + + return true; +} + static u32 thead_errata_probe(unsigned int stage, unsigned long archid, unsigned long impid) { @@ -58,6 +74,9 @@ static u32 thead_errata_probe(unsigned int stage, if (errata_probe_cmo(stage, archid, impid)) cpu_req_errata |= BIT(ERRATA_THEAD_CMO); + if (errata_probe_pmu(stage, archid, impid)) + cpu_req_errata |= BIT(ERRATA_THEAD_PMU); + return cpu_req_errata; } diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 19a771085781a..4180312d2a701 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -6,6 +6,7 @@ #define ASM_ERRATA_LIST_H #include +#include #include #ifdef CONFIG_ERRATA_SIFIVE @@ -17,7 +18,8 @@ #ifdef CONFIG_ERRATA_THEAD #define ERRATA_THEAD_PBMT 0 #define ERRATA_THEAD_CMO 1 -#define ERRATA_THEAD_NUMBER 2 +#define ERRATA_THEAD_PMU 2 +#define ERRATA_THEAD_NUMBER 3 #endif #define CPUFEATURE_SVPBMT 0 @@ -142,6 +144,18 @@ asm volatile(ALTERNATIVE_2( \ "r"((unsigned long)(_start) + (_size)) \ : "a0") +#define THEAD_C9XX_RV_IRQ_PMU 17 +#define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5 + +#define ALT_SBI_PMU_OVERFLOW(__ovl) \ +asm volatile(ALTERNATIVE( \ + "csrr %0, " __stringify(CSR_SSCOUNTOVF), \ + "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \ + THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \ + CONFIG_ERRATA_THEAD_PMU) \ + : "=r" (__ovl) : \ + : "memory") + #endif /* __ASSEMBLY__ */ #endif diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 3852c18362f53..f6507efe2a584 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -47,6 +48,8 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = { * per_cpu in case of harts with different pmu counters */ static union sbi_pmu_ctr_info *pmu_ctr_list; +static bool riscv_pmu_use_irq; +static unsigned int riscv_pmu_irq_num; static unsigned int riscv_pmu_irq; struct sbi_pmu_event_data { @@ -580,7 +583,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) fidx = find_first_bit(cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS); event = cpu_hw_evt->events[fidx]; if (!event) { - csr_clear(CSR_SIP, SIP_LCOFIP); + csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num)); return IRQ_NONE; } @@ -588,13 +591,13 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) pmu_sbi_stop_hw_ctrs(pmu); /* Overflow status register should only be read after counter are stopped */ - overflow = csr_read(CSR_SSCOUNTOVF); + ALT_SBI_PMU_OVERFLOW(overflow); /* * Overflow interrupt pending bit should only be cleared after stopping * all the counters to avoid any race condition. */ - csr_clear(CSR_SIP, SIP_LCOFIP); + csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num)); /* No overflow bit is set */ if (!overflow) @@ -661,10 +664,10 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) /* Stop all the counters so that they can be enabled from perf */ pmu_sbi_stop_all(pmu); - if (riscv_isa_extension_available(NULL, SSCOFPMF)) { + if (riscv_pmu_use_irq) { cpu_hw_evt->irq = riscv_pmu_irq; - csr_clear(CSR_IP, BIT(RV_IRQ_PMU)); - csr_set(CSR_IE, BIT(RV_IRQ_PMU)); + csr_clear(CSR_IP, BIT(riscv_pmu_irq_num)); + csr_set(CSR_IE, BIT(riscv_pmu_irq_num)); enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE); } @@ -673,9 +676,9 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node) { - if (riscv_isa_extension_available(NULL, SSCOFPMF)) { + if (riscv_pmu_use_irq) { disable_percpu_irq(riscv_pmu_irq); - csr_clear(CSR_IE, BIT(RV_IRQ_PMU)); + csr_clear(CSR_IE, BIT(riscv_pmu_irq_num)); } /* Disable all counters access for user mode now */ @@ -691,7 +694,18 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde struct device_node *cpu, *child; struct irq_domain *domain = NULL; - if (!riscv_isa_extension_available(NULL, SSCOFPMF)) + if (riscv_isa_extension_available(NULL, SSCOFPMF)) { + riscv_pmu_irq_num = RV_IRQ_PMU; + riscv_pmu_use_irq = true; + } else if (IS_ENABLED(CONFIG_ERRATA_THEAD_PMU) && + riscv_cached_mvendorid(0) == THEAD_VENDOR_ID && + riscv_cached_marchid(0) == 0 && + riscv_cached_mimpid(0) == 0) { + riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU; + riscv_pmu_use_irq = true; + } + + if (!riscv_pmu_use_irq) return -EOPNOTSUPP; for_each_of_cpu_node(cpu) { @@ -713,7 +727,7 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde return -ENODEV; } - riscv_pmu_irq = irq_create_mapping(domain, RV_IRQ_PMU); + riscv_pmu_irq = irq_create_mapping(domain, riscv_pmu_irq_num); if (!riscv_pmu_irq) { pr_err("Failed to map PMU interrupt for node\n"); return -ENODEV; -- GitLab From 2348e6bf44213c5f447ff698e43c089185241ed7 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 18 Oct 2022 22:12:00 +0800 Subject: [PATCH 0230/1423] riscv: remove special treatment for the link order of head.o arch/riscv/kernel/head.o does not need any special treatment - the only requirement is the ".head.text" section must be placed before the normal ".text" section. The linker script does the right thing to do. The build system does not need to manipulate the link order of head.o. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20221018141200.1040-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- scripts/head-object-list.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/head-object-list.txt b/scripts/head-object-list.txt index b16326a92c458..105ea7ac47511 100644 --- a/scripts/head-object-list.txt +++ b/scripts/head-object-list.txt @@ -39,7 +39,6 @@ arch/powerpc/kernel/entry_64.o arch/powerpc/kernel/fpu.o arch/powerpc/kernel/vector.o arch/powerpc/kernel/prom_init.o -arch/riscv/kernel/head.o arch/s390/kernel/head64.o arch/sh/kernel/head_32.o arch/sparc/kernel/head_32.o -- GitLab From 28fc4e9077ce59ab28c89c20dc6be5154473218f Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 18 Oct 2022 10:45:32 +0800 Subject: [PATCH 0231/1423] f2fs: Fix the race condition of resize flag between resizefs Because the set/clear SBI_IS_RESIZEFS flag not between any locks, In the following case: thread1 thread2 ->ioctl(resizefs) ->set RESIZEFS flag ->ioctl(resizefs) ... ->set RESIZEFS flag ->clear RESIZEFS flag ->resizefs stream # No RESIZEFS flag in the stream Also before freeze_super, the resizefs not started, we should not set the SBI_IS_RESIZEFS flag. So move the set/clear SBI_IS_RESIZEFS flag between the cp_mutex and gc_lock. Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Zhang Xiaoxu Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dab794225cce7..7b4be412cec06 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2134,8 +2134,6 @@ out_unlock: if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2151,6 +2149,7 @@ out_unlock: if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2174,6 +2173,7 @@ out_unlock: f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2186,6 +2186,5 @@ out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } -- GitLab From 299c481fa5c121f892420d97f1123a853b7f1079 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:39 +0000 Subject: [PATCH 0232/1423] crypto: rockchip - use dev_err for error message about interrupt Interrupt is mandatory so the message should be printed as error. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 35d73061d1569..45cc5f766788c 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -371,8 +371,7 @@ static int rk_crypto_probe(struct platform_device *pdev) crypto_info->irq = platform_get_irq(pdev, 0); if (crypto_info->irq < 0) { - dev_warn(crypto_info->dev, - "control Interrupt is not available.\n"); + dev_err(&pdev->dev, "control Interrupt is not available.\n"); err = crypto_info->irq; goto err_crypto; } -- GitLab From 8ccd9c8cd1d1618f5e073c86ffcfe15f292eefe6 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:40 +0000 Subject: [PATCH 0233/1423] crypto: rockchip - do not use uninitialized variable crypto_info->dev is not yet set, so use pdev->dev instead. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 45cc5f766788c..21d3f14585843 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -381,7 +381,7 @@ static int rk_crypto_probe(struct platform_device *pdev) "rk-crypto", pdev); if (err) { - dev_err(crypto_info->dev, "irq request failed.\n"); + dev_err(&pdev->dev, "irq request failed.\n"); goto err_crypto; } -- GitLab From c50ef1411c8cbad0c7db100c477126076b6e3348 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:41 +0000 Subject: [PATCH 0234/1423] crypto: rockchip - do not do custom power management The clock enable/disable at tfm init/exit is fragile, if 2 tfm are init in the same time and one is removed just after, it will leave the hardware uncloked even if a user remains. Instead simply enable clocks at probe time. We will do PM later. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 4 ++-- drivers/crypto/rockchip/rk3288_crypto.h | 2 -- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 3 +-- drivers/crypto/rockchip/rk3288_crypto_skcipher.c | 5 +++-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 21d3f14585843..4cff49b829838 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -394,8 +394,7 @@ static int rk_crypto_probe(struct platform_device *pdev) rk_crypto_done_task_cb, (unsigned long)crypto_info); crypto_init_queue(&crypto_info->queue, 50); - crypto_info->enable_clk = rk_crypto_enable_clk; - crypto_info->disable_clk = rk_crypto_disable_clk; + rk_crypto_enable_clk(crypto_info); crypto_info->load_data = rk_load_data; crypto_info->unload_data = rk_unload_data; crypto_info->enqueue = rk_crypto_enqueue; @@ -422,6 +421,7 @@ static int rk_crypto_remove(struct platform_device *pdev) struct rk_crypto_info *crypto_tmp = platform_get_drvdata(pdev); rk_crypto_unregister(); + rk_crypto_disable_clk(crypto_tmp); tasklet_kill(&crypto_tmp->done_task); tasklet_kill(&crypto_tmp->queue_task); return 0; diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 97278c2574ff9..2fa7131e40604 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -220,8 +220,6 @@ struct rk_crypto_info { int (*start)(struct rk_crypto_info *dev); int (*update)(struct rk_crypto_info *dev); void (*complete)(struct crypto_async_request *base, int err); - int (*enable_clk)(struct rk_crypto_info *dev); - void (*disable_clk)(struct rk_crypto_info *dev); int (*load_data)(struct rk_crypto_info *dev, struct scatterlist *sg_src, struct scatterlist *sg_dst); diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index ed03058497bc2..49017d1fb5104 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -301,7 +301,7 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) sizeof(struct rk_ahash_rctx) + crypto_ahash_reqsize(tctx->fallback_tfm)); - return tctx->dev->enable_clk(tctx->dev); + return 0; } static void rk_cra_hash_exit(struct crypto_tfm *tfm) @@ -309,7 +309,6 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); free_page((unsigned long)tctx->dev->addr_vir); - return tctx->dev->disable_clk(tctx->dev); } struct rk_crypto_tmp rk_ahash_sha1 = { diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 5bbf0d2722e11..8c44a19eab751 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -388,8 +388,10 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) ctx->dev->update = rk_ablk_rx; ctx->dev->complete = rk_crypto_complete; ctx->dev->addr_vir = (char *)__get_free_page(GFP_KERNEL); + if (!ctx->dev->addr_vir) + return -ENOMEM; - return ctx->dev->addr_vir ? ctx->dev->enable_clk(ctx->dev) : -ENOMEM; + return 0; } static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) @@ -397,7 +399,6 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); free_page((unsigned long)ctx->dev->addr_vir); - ctx->dev->disable_clk(ctx->dev); } struct rk_crypto_tmp rk_ecb_aes_alg = { -- GitLab From 6d11c9387865723fd779be00ae37a4588e60133d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:42 +0000 Subject: [PATCH 0235/1423] crypto: rockchip - fix privete/private typo This fix a simple typo on private word. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 2fa7131e40604..656d6795d4004 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -235,7 +235,7 @@ struct rk_ahash_ctx { struct crypto_ahash *fallback_tfm; }; -/* the privete variable of hash for fallback */ +/* the private variable of hash for fallback */ struct rk_ahash_rctx { struct ahash_request fallback_req; u32 mode; -- GitLab From 87e356c4966444866186f68f05832fdcc0f351a3 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:43 +0000 Subject: [PATCH 0236/1423] crypto: rockchip - do not store mode globally Storing the mode globally does not work if 2 requests are handled in the same time. We should store it in a request context. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.h | 5 +- .../crypto/rockchip/rk3288_crypto_skcipher.c | 58 ++++++++++++------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 656d6795d4004..c919d9a43a082 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -245,10 +245,13 @@ struct rk_ahash_rctx { struct rk_cipher_ctx { struct rk_crypto_info *dev; unsigned int keylen; - u32 mode; u8 iv[AES_BLOCK_SIZE]; }; +struct rk_cipher_rctx { + u32 mode; +}; + enum alg_type { ALG_TYPE_HASH, ALG_TYPE_CIPHER, diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 8c44a19eab751..bbd0bf52bf07e 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -76,9 +76,10 @@ static int rk_aes_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_AES_ECB_MODE; + rctx->mode = RK_CRYPTO_AES_ECB_MODE; return rk_handle_req(dev, req); } @@ -86,9 +87,10 @@ static int rk_aes_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_AES_ECB_MODE | RK_CRYPTO_DEC; + rctx->mode = RK_CRYPTO_AES_ECB_MODE | RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -96,9 +98,10 @@ static int rk_aes_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_AES_CBC_MODE; + rctx->mode = RK_CRYPTO_AES_CBC_MODE; return rk_handle_req(dev, req); } @@ -106,9 +109,10 @@ static int rk_aes_cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_AES_CBC_MODE | RK_CRYPTO_DEC; + rctx->mode = RK_CRYPTO_AES_CBC_MODE | RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -116,9 +120,10 @@ static int rk_des_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = 0; + rctx->mode = 0; return rk_handle_req(dev, req); } @@ -126,9 +131,10 @@ static int rk_des_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_DEC; + rctx->mode = RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -136,9 +142,10 @@ static int rk_des_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC; + rctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC; return rk_handle_req(dev, req); } @@ -146,9 +153,10 @@ static int rk_des_cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC | RK_CRYPTO_DEC; + rctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC | RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -156,9 +164,10 @@ static int rk_des3_ede_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_SELECT; + rctx->mode = RK_CRYPTO_TDES_SELECT; return rk_handle_req(dev, req); } @@ -166,9 +175,10 @@ static int rk_des3_ede_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_DEC; + rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -176,9 +186,10 @@ static int rk_des3_ede_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC; + rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC; return rk_handle_req(dev, req); } @@ -186,9 +197,10 @@ static int rk_des3_ede_cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *dev = ctx->dev; - ctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC | + rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC | RK_CRYPTO_DEC; return rk_handle_req(dev, req); } @@ -199,6 +211,7 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) skcipher_request_cast(dev->async_req); struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(req); struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(cipher); u32 ivsize, block, conf_reg = 0; @@ -206,22 +219,22 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) ivsize = crypto_skcipher_ivsize(cipher); if (block == DES_BLOCK_SIZE) { - ctx->mode |= RK_CRYPTO_TDES_FIFO_MODE | + rctx->mode |= RK_CRYPTO_TDES_FIFO_MODE | RK_CRYPTO_TDES_BYTESWAP_KEY | RK_CRYPTO_TDES_BYTESWAP_IV; - CRYPTO_WRITE(dev, RK_CRYPTO_TDES_CTRL, ctx->mode); + CRYPTO_WRITE(dev, RK_CRYPTO_TDES_CTRL, rctx->mode); memcpy_toio(dev->reg + RK_CRYPTO_TDES_IV_0, req->iv, ivsize); conf_reg = RK_CRYPTO_DESSEL; } else { - ctx->mode |= RK_CRYPTO_AES_FIFO_MODE | + rctx->mode |= RK_CRYPTO_AES_FIFO_MODE | RK_CRYPTO_AES_KEY_CHANGE | RK_CRYPTO_AES_BYTESWAP_KEY | RK_CRYPTO_AES_BYTESWAP_IV; if (ctx->keylen == AES_KEYSIZE_192) - ctx->mode |= RK_CRYPTO_AES_192BIT_key; + rctx->mode |= RK_CRYPTO_AES_192BIT_key; else if (ctx->keylen == AES_KEYSIZE_256) - ctx->mode |= RK_CRYPTO_AES_256BIT_key; - CRYPTO_WRITE(dev, RK_CRYPTO_AES_CTRL, ctx->mode); + rctx->mode |= RK_CRYPTO_AES_256BIT_key; + CRYPTO_WRITE(dev, RK_CRYPTO_AES_CTRL, rctx->mode); memcpy_toio(dev->reg + RK_CRYPTO_AES_IV_0, req->iv, ivsize); } conf_reg |= RK_CRYPTO_BYTESWAP_BTFIFO | @@ -246,6 +259,7 @@ static int rk_set_data_start(struct rk_crypto_info *dev) struct skcipher_request *req = skcipher_request_cast(dev->async_req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); u32 ivsize = crypto_skcipher_ivsize(tfm); u8 *src_last_blk = page_address(sg_page(dev->sg_src)) + @@ -254,7 +268,7 @@ static int rk_set_data_start(struct rk_crypto_info *dev) /* Store the iv that need to be updated in chain mode. * And update the IV buffer to contain the next IV for decryption mode. */ - if (ctx->mode & RK_CRYPTO_DEC) { + if (rctx->mode & RK_CRYPTO_DEC) { memcpy(ctx->iv, src_last_blk, ivsize); sg_pcopy_to_buffer(dev->first, dev->src_nents, req->iv, ivsize, dev->total - ivsize); @@ -294,11 +308,12 @@ static void rk_iv_copyback(struct rk_crypto_info *dev) struct skcipher_request *req = skcipher_request_cast(dev->async_req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); u32 ivsize = crypto_skcipher_ivsize(tfm); /* Update the IV buffer to contain the next IV for encryption mode. */ - if (!(ctx->mode & RK_CRYPTO_DEC)) { + if (!(rctx->mode & RK_CRYPTO_DEC)) { if (dev->aligned) { memcpy(req->iv, sg_virt(dev->sg_dst) + dev->sg_dst->length - ivsize, ivsize); @@ -314,11 +329,12 @@ static void rk_update_iv(struct rk_crypto_info *dev) struct skcipher_request *req = skcipher_request_cast(dev->async_req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); u32 ivsize = crypto_skcipher_ivsize(tfm); u8 *new_iv = NULL; - if (ctx->mode & RK_CRYPTO_DEC) { + if (rctx->mode & RK_CRYPTO_DEC) { new_iv = ctx->iv; } else { new_iv = page_address(sg_page(dev->sg_dst)) + -- GitLab From 68ef8af09a1a912a5ed2cfaa4cca7606f52cef90 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:44 +0000 Subject: [PATCH 0237/1423] crypto: rockchip - add fallback for cipher The hardware does not handle 0 size length request, let's add a fallback. Furthermore fallback will be used for all unaligned case the hardware cannot handle. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 4 + drivers/crypto/rockchip/rk3288_crypto.h | 2 + .../crypto/rockchip/rk3288_crypto_skcipher.c | 97 ++++++++++++++++--- 3 files changed, 90 insertions(+), 13 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 55e75fbb658ee..113b35f695980 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -669,6 +669,10 @@ config CRYPTO_DEV_IMGTEC_HASH config CRYPTO_DEV_ROCKCHIP tristate "Rockchip's Cryptographic Engine driver" depends on OF && ARCH_ROCKCHIP + depends on PM + select CRYPTO_ECB + select CRYPTO_CBC + select CRYPTO_DES select CRYPTO_AES select CRYPTO_LIB_DES select CRYPTO_MD5 diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index c919d9a43a082..8b1e15d8ddc67 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -246,10 +246,12 @@ struct rk_cipher_ctx { struct rk_crypto_info *dev; unsigned int keylen; u8 iv[AES_BLOCK_SIZE]; + struct crypto_skcipher *fallback_tfm; }; struct rk_cipher_rctx { u32 mode; + struct skcipher_request fallback_req; // keep at the end }; enum alg_type { diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index bbd0bf52bf07e..eac5bba66e257 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -13,6 +13,63 @@ #define RK_CRYPTO_DEC BIT(0) +static int rk_cipher_need_fallback(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + unsigned int bs = crypto_skcipher_blocksize(tfm); + struct scatterlist *sgs, *sgd; + unsigned int stodo, dtodo, len; + + if (!req->cryptlen) + return true; + + len = req->cryptlen; + sgs = req->src; + sgd = req->dst; + while (sgs && sgd) { + if (!IS_ALIGNED(sgs->offset, sizeof(u32))) { + return true; + } + if (!IS_ALIGNED(sgd->offset, sizeof(u32))) { + return true; + } + stodo = min(len, sgs->length); + if (stodo % bs) { + return true; + } + dtodo = min(len, sgd->length); + if (dtodo % bs) { + return true; + } + if (stodo != dtodo) { + return true; + } + len -= stodo; + sgs = sg_next(sgs); + sgd = sg_next(sgd); + } + return false; +} + +static int rk_cipher_fallback(struct skcipher_request *areq) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(areq); + struct rk_cipher_ctx *op = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(areq); + int err; + + skcipher_request_set_tfm(&rctx->fallback_req, op->fallback_tfm); + skcipher_request_set_callback(&rctx->fallback_req, areq->base.flags, + areq->base.complete, areq->base.data); + skcipher_request_set_crypt(&rctx->fallback_req, areq->src, areq->dst, + areq->cryptlen, areq->iv); + if (rctx->mode & RK_CRYPTO_DEC) + err = crypto_skcipher_decrypt(&rctx->fallback_req); + else + err = crypto_skcipher_encrypt(&rctx->fallback_req); + return err; +} + static void rk_crypto_complete(struct crypto_async_request *base, int err) { if (base->complete) @@ -22,10 +79,10 @@ static void rk_crypto_complete(struct crypto_async_request *base, int err) static int rk_handle_req(struct rk_crypto_info *dev, struct skcipher_request *req) { - if (!IS_ALIGNED(req->cryptlen, dev->align_size)) - return -EINVAL; - else - return dev->enqueue(dev, &req->base); + if (rk_cipher_need_fallback(req)) + return rk_cipher_fallback(req); + + return dev->enqueue(dev, &req->base); } static int rk_aes_setkey(struct crypto_skcipher *cipher, @@ -39,7 +96,8 @@ static int rk_aes_setkey(struct crypto_skcipher *cipher, return -EINVAL; ctx->keylen = keylen; memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, key, keylen); - return 0; + + return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } static int rk_des_setkey(struct crypto_skcipher *cipher, @@ -54,7 +112,8 @@ static int rk_des_setkey(struct crypto_skcipher *cipher, ctx->keylen = keylen; memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, key, keylen); - return 0; + + return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } static int rk_tdes_setkey(struct crypto_skcipher *cipher, @@ -69,7 +128,7 @@ static int rk_tdes_setkey(struct crypto_skcipher *cipher, ctx->keylen = keylen; memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, key, keylen); - return 0; + return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } static int rk_aes_ecb_encrypt(struct skcipher_request *req) @@ -394,6 +453,7 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) { struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + const char *name = crypto_tfm_alg_name(&tfm->base); struct rk_crypto_tmp *algt; algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); @@ -407,6 +467,16 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) if (!ctx->dev->addr_vir) return -ENOMEM; + ctx->fallback_tfm = crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK); + if (IS_ERR(ctx->fallback_tfm)) { + dev_err(ctx->dev->dev, "ERROR: Cannot allocate fallback for %s %ld\n", + name, PTR_ERR(ctx->fallback_tfm)); + return PTR_ERR(ctx->fallback_tfm); + } + + tfm->reqsize = sizeof(struct rk_cipher_rctx) + + crypto_skcipher_reqsize(ctx->fallback_tfm); + return 0; } @@ -415,6 +485,7 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); free_page((unsigned long)ctx->dev->addr_vir); + crypto_free_skcipher(ctx->fallback_tfm); } struct rk_crypto_tmp rk_ecb_aes_alg = { @@ -423,7 +494,7 @@ struct rk_crypto_tmp rk_ecb_aes_alg = { .base.cra_name = "ecb(aes)", .base.cra_driver_name = "ecb-aes-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x0f, @@ -445,7 +516,7 @@ struct rk_crypto_tmp rk_cbc_aes_alg = { .base.cra_name = "cbc(aes)", .base.cra_driver_name = "cbc-aes-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x0f, @@ -468,7 +539,7 @@ struct rk_crypto_tmp rk_ecb_des_alg = { .base.cra_name = "ecb(des)", .base.cra_driver_name = "ecb-des-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x07, @@ -490,7 +561,7 @@ struct rk_crypto_tmp rk_cbc_des_alg = { .base.cra_name = "cbc(des)", .base.cra_driver_name = "cbc-des-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x07, @@ -513,7 +584,7 @@ struct rk_crypto_tmp rk_ecb_des3_ede_alg = { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "ecb-des3-ede-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x07, @@ -535,7 +606,7 @@ struct rk_crypto_tmp rk_cbc_des3_ede_alg = { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "cbc-des3-ede-rk", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct rk_cipher_ctx), .base.cra_alignmask = 0x07, -- GitLab From 816600485cb597b3ff7d6806a95a78512839f775 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:45 +0000 Subject: [PATCH 0238/1423] crypto: rockchip - add fallback for ahash Adds a fallback for all case hardware cannot handle. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 49017d1fb5104..16009bb0bf160 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -16,6 +16,40 @@ * so we put the fixed hash out when met zero message. */ +static bool rk_ahash_need_fallback(struct ahash_request *req) +{ + struct scatterlist *sg; + + sg = req->src; + while (sg) { + if (!IS_ALIGNED(sg->offset, sizeof(u32))) { + return true; + } + if (sg->length % 4) { + return true; + } + sg = sg_next(sg); + } + return false; +} + +static int rk_ahash_digest_fb(struct ahash_request *areq) +{ + struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); + struct rk_ahash_ctx *tfmctx = crypto_ahash_ctx(tfm); + + ahash_request_set_tfm(&rctx->fallback_req, tfmctx->fallback_tfm); + rctx->fallback_req.base.flags = areq->base.flags & + CRYPTO_TFM_REQ_MAY_SLEEP; + + rctx->fallback_req.nbytes = areq->nbytes; + rctx->fallback_req.src = areq->src; + rctx->fallback_req.result = areq->result; + + return crypto_ahash_digest(&rctx->fallback_req); +} + static int zero_message_process(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); @@ -167,6 +201,9 @@ static int rk_ahash_digest(struct ahash_request *req) struct rk_ahash_ctx *tctx = crypto_tfm_ctx(req->base.tfm); struct rk_crypto_info *dev = tctx->dev; + if (rk_ahash_need_fallback(req)) + return rk_ahash_digest_fb(req); + if (!req->nbytes) return zero_message_process(req); else @@ -309,6 +346,7 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); free_page((unsigned long)tctx->dev->addr_vir); + crypto_free_ahash(tctx->fallback_tfm); } struct rk_crypto_tmp rk_ahash_sha1 = { -- GitLab From d6b23ccef82816050c2fd458c9dabfa0e0af09b9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:46 +0000 Subject: [PATCH 0239/1423] crypto: rockchip - better handle cipher key The key should not be set in hardware too much in advance, this will fail it 2 TFM with different keys generate alternative requests. The key should be stored and used just before doing cipher operations. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.h | 1 + drivers/crypto/rockchip/rk3288_crypto_skcipher.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 8b1e15d8ddc67..540b81a14b9be 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -245,6 +245,7 @@ struct rk_ahash_rctx { struct rk_cipher_ctx { struct rk_crypto_info *dev; unsigned int keylen; + u8 key[AES_MAX_KEY_SIZE]; u8 iv[AES_BLOCK_SIZE]; struct crypto_skcipher *fallback_tfm; }; diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index eac5bba66e257..1ef94f8db2c5c 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -95,7 +95,7 @@ static int rk_aes_setkey(struct crypto_skcipher *cipher, keylen != AES_KEYSIZE_256) return -EINVAL; ctx->keylen = keylen; - memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, key, keylen); + memcpy(ctx->key, key, keylen); return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } @@ -111,7 +111,7 @@ static int rk_des_setkey(struct crypto_skcipher *cipher, return err; ctx->keylen = keylen; - memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, key, keylen); + memcpy(ctx->key, key, keylen); return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } @@ -127,7 +127,8 @@ static int rk_tdes_setkey(struct crypto_skcipher *cipher, return err; ctx->keylen = keylen; - memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, key, keylen); + memcpy(ctx->key, key, keylen); + return crypto_skcipher_setkey(ctx->fallback_tfm, key, keylen); } @@ -283,6 +284,7 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) RK_CRYPTO_TDES_BYTESWAP_IV; CRYPTO_WRITE(dev, RK_CRYPTO_TDES_CTRL, rctx->mode); memcpy_toio(dev->reg + RK_CRYPTO_TDES_IV_0, req->iv, ivsize); + memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, ctx->key, ctx->keylen); conf_reg = RK_CRYPTO_DESSEL; } else { rctx->mode |= RK_CRYPTO_AES_FIFO_MODE | @@ -295,6 +297,7 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) rctx->mode |= RK_CRYPTO_AES_256BIT_key; CRYPTO_WRITE(dev, RK_CRYPTO_AES_CTRL, rctx->mode); memcpy_toio(dev->reg + RK_CRYPTO_AES_IV_0, req->iv, ivsize); + memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, ctx->key, ctx->keylen); } conf_reg |= RK_CRYPTO_BYTESWAP_BTFIFO | RK_CRYPTO_BYTESWAP_BRFIFO; @@ -484,6 +487,7 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) { struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); + memzero_explicit(ctx->key, ctx->keylen); free_page((unsigned long)ctx->dev->addr_vir); crypto_free_skcipher(ctx->fallback_tfm); } -- GitLab From bb3c7b73363c9a149b12b74c44ae94b73a8fddf8 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:47 +0000 Subject: [PATCH 0240/1423] crypto: rockchip - remove non-aligned handling Now driver have fallback for un-aligned cases, remove all code handling those cases. Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 69 +++++-------------- drivers/crypto/rockchip/rk3288_crypto.h | 4 -- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 22 ++---- .../crypto/rockchip/rk3288_crypto_skcipher.c | 39 +++-------- 4 files changed, 31 insertions(+), 103 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 4cff49b829838..b3db096e2ec29 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -88,63 +88,26 @@ static int rk_load_data(struct rk_crypto_info *dev, { unsigned int count; - dev->aligned = dev->aligned ? - check_alignment(sg_src, sg_dst, dev->align_size) : - dev->aligned; - if (dev->aligned) { - count = min(dev->left_bytes, sg_src->length); - dev->left_bytes -= count; - - if (!dma_map_sg(dev->dev, sg_src, 1, DMA_TO_DEVICE)) { - dev_err(dev->dev, "[%s:%d] dma_map_sg(src) error\n", + count = min(dev->left_bytes, sg_src->length); + dev->left_bytes -= count; + + if (!dma_map_sg(dev->dev, sg_src, 1, DMA_TO_DEVICE)) { + dev_err(dev->dev, "[%s:%d] dma_map_sg(src) error\n", __func__, __LINE__); - return -EINVAL; - } - dev->addr_in = sg_dma_address(sg_src); + return -EINVAL; + } + dev->addr_in = sg_dma_address(sg_src); - if (sg_dst) { - if (!dma_map_sg(dev->dev, sg_dst, 1, DMA_FROM_DEVICE)) { - dev_err(dev->dev, + if (sg_dst) { + if (!dma_map_sg(dev->dev, sg_dst, 1, DMA_FROM_DEVICE)) { + dev_err(dev->dev, "[%s:%d] dma_map_sg(dst) error\n", __func__, __LINE__); - dma_unmap_sg(dev->dev, sg_src, 1, - DMA_TO_DEVICE); - return -EINVAL; - } - dev->addr_out = sg_dma_address(sg_dst); - } - } else { - count = (dev->left_bytes > PAGE_SIZE) ? - PAGE_SIZE : dev->left_bytes; - - if (!sg_pcopy_to_buffer(dev->first, dev->src_nents, - dev->addr_vir, count, - dev->total - dev->left_bytes)) { - dev_err(dev->dev, "[%s:%d] pcopy err\n", - __func__, __LINE__); + dma_unmap_sg(dev->dev, sg_src, 1, + DMA_TO_DEVICE); return -EINVAL; } - dev->left_bytes -= count; - sg_init_one(&dev->sg_tmp, dev->addr_vir, count); - if (!dma_map_sg(dev->dev, &dev->sg_tmp, 1, DMA_TO_DEVICE)) { - dev_err(dev->dev, "[%s:%d] dma_map_sg(sg_tmp) error\n", - __func__, __LINE__); - return -ENOMEM; - } - dev->addr_in = sg_dma_address(&dev->sg_tmp); - - if (sg_dst) { - if (!dma_map_sg(dev->dev, &dev->sg_tmp, 1, - DMA_FROM_DEVICE)) { - dev_err(dev->dev, - "[%s:%d] dma_map_sg(sg_tmp) error\n", - __func__, __LINE__); - dma_unmap_sg(dev->dev, &dev->sg_tmp, 1, - DMA_TO_DEVICE); - return -ENOMEM; - } - dev->addr_out = sg_dma_address(&dev->sg_tmp); - } + dev->addr_out = sg_dma_address(sg_dst); } dev->count = count; return 0; @@ -154,11 +117,11 @@ static void rk_unload_data(struct rk_crypto_info *dev) { struct scatterlist *sg_in, *sg_out; - sg_in = dev->aligned ? dev->sg_src : &dev->sg_tmp; + sg_in = dev->sg_src; dma_unmap_sg(dev->dev, sg_in, 1, DMA_TO_DEVICE); if (dev->sg_dst) { - sg_out = dev->aligned ? dev->sg_dst : &dev->sg_tmp; + sg_out = dev->sg_dst; dma_unmap_sg(dev->dev, sg_out, 1, DMA_FROM_DEVICE); } } diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 540b81a14b9be..a7de5738f6dc4 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -204,12 +204,8 @@ struct rk_crypto_info { /* the public variable */ struct scatterlist *sg_src; struct scatterlist *sg_dst; - struct scatterlist sg_tmp; struct scatterlist *first; unsigned int left_bytes; - void *addr_vir; - int aligned; - int align_size; size_t src_nents; size_t dst_nents; unsigned int total; diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 16009bb0bf160..c762e462eb570 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -236,8 +236,6 @@ static int rk_ahash_start(struct rk_crypto_info *dev) dev->total = req->nbytes; dev->left_bytes = req->nbytes; - dev->aligned = 0; - dev->align_size = 4; dev->sg_dst = NULL; dev->sg_src = req->src; dev->first = req->src; @@ -272,15 +270,13 @@ static int rk_ahash_crypto_rx(struct rk_crypto_info *dev) dev->unload_data(dev); if (dev->left_bytes) { - if (dev->aligned) { - if (sg_is_last(dev->sg_src)) { - dev_warn(dev->dev, "[%s:%d], Lack of data\n", - __func__, __LINE__); - err = -ENOMEM; - goto out_rx; - } - dev->sg_src = sg_next(dev->sg_src); + if (sg_is_last(dev->sg_src)) { + dev_warn(dev->dev, "[%s:%d], Lack of data\n", + __func__, __LINE__); + err = -ENOMEM; + goto out_rx; } + dev->sg_src = sg_next(dev->sg_src); err = rk_ahash_set_data_start(dev); } else { /* @@ -318,11 +314,6 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) algt = container_of(alg, struct rk_crypto_tmp, alg.hash); tctx->dev = algt->dev; - tctx->dev->addr_vir = (void *)__get_free_page(GFP_KERNEL); - if (!tctx->dev->addr_vir) { - dev_err(tctx->dev->dev, "failed to kmalloc for addr_vir\n"); - return -ENOMEM; - } tctx->dev->start = rk_ahash_start; tctx->dev->update = rk_ahash_crypto_rx; tctx->dev->complete = rk_ahash_crypto_complete; @@ -345,7 +336,6 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) { struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); - free_page((unsigned long)tctx->dev->addr_vir); crypto_free_ahash(tctx->fallback_tfm); } diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 1ef94f8db2c5c..d067b7f091659 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -356,7 +356,6 @@ static int rk_ablk_start(struct rk_crypto_info *dev) dev->src_nents = sg_nents(req->src); dev->sg_dst = req->dst; dev->dst_nents = sg_nents(req->dst); - dev->aligned = 1; spin_lock_irqsave(&dev->lock, flags); rk_ablk_hw_init(dev); @@ -376,13 +375,9 @@ static void rk_iv_copyback(struct rk_crypto_info *dev) /* Update the IV buffer to contain the next IV for encryption mode. */ if (!(rctx->mode & RK_CRYPTO_DEC)) { - if (dev->aligned) { - memcpy(req->iv, sg_virt(dev->sg_dst) + - dev->sg_dst->length - ivsize, ivsize); - } else { - memcpy(req->iv, dev->addr_vir + - dev->count - ivsize, ivsize); - } + memcpy(req->iv, + sg_virt(dev->sg_dst) + dev->sg_dst->length - ivsize, + ivsize); } } @@ -420,27 +415,16 @@ static int rk_ablk_rx(struct rk_crypto_info *dev) skcipher_request_cast(dev->async_req); dev->unload_data(dev); - if (!dev->aligned) { - if (!sg_pcopy_from_buffer(req->dst, dev->dst_nents, - dev->addr_vir, dev->count, - dev->total - dev->left_bytes - - dev->count)) { - err = -EINVAL; - goto out_rx; - } - } if (dev->left_bytes) { rk_update_iv(dev); - if (dev->aligned) { - if (sg_is_last(dev->sg_src)) { - dev_err(dev->dev, "[%s:%d] Lack of data\n", + if (sg_is_last(dev->sg_src)) { + dev_err(dev->dev, "[%s:%d] Lack of data\n", __func__, __LINE__); - err = -ENOMEM; - goto out_rx; - } - dev->sg_src = sg_next(dev->sg_src); - dev->sg_dst = sg_next(dev->sg_dst); + err = -ENOMEM; + goto out_rx; } + dev->sg_src = sg_next(dev->sg_src); + dev->sg_dst = sg_next(dev->sg_dst); err = rk_set_data_start(dev); } else { rk_iv_copyback(dev); @@ -462,13 +446,9 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); ctx->dev = algt->dev; - ctx->dev->align_size = crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm)) + 1; ctx->dev->start = rk_ablk_start; ctx->dev->update = rk_ablk_rx; ctx->dev->complete = rk_crypto_complete; - ctx->dev->addr_vir = (char *)__get_free_page(GFP_KERNEL); - if (!ctx->dev->addr_vir) - return -ENOMEM; ctx->fallback_tfm = crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(ctx->fallback_tfm)) { @@ -488,7 +468,6 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); memzero_explicit(ctx->key, ctx->keylen); - free_page((unsigned long)ctx->dev->addr_vir); crypto_free_skcipher(ctx->fallback_tfm); } -- GitLab From 57d67c6e8219b2a034c16d6149e30fb40fd39935 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:48 +0000 Subject: [PATCH 0241/1423] crypto: rockchip - rework by using crypto_engine Instead of doing manual queue management, let's use the crypto/engine for that. In the same time, rework the requests handling to be easier to understand (and fix all bugs related to them). Fixes: ce0183cb6464b ("crypto: rockchip - switch to skcipher API") Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 1 + drivers/crypto/rockchip/rk3288_crypto.c | 152 +---------- drivers/crypto/rockchip/rk3288_crypto.h | 39 +-- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 144 +++++----- .../crypto/rockchip/rk3288_crypto_skcipher.c | 250 +++++++++--------- 5 files changed, 221 insertions(+), 365 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 113b35f695980..c30b5a39c2ac2 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -674,6 +674,7 @@ config CRYPTO_DEV_ROCKCHIP select CRYPTO_CBC select CRYPTO_DES select CRYPTO_AES + select CRYPTO_ENGINE select CRYPTO_LIB_DES select CRYPTO_MD5 select CRYPTO_SHA1 diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index b3db096e2ec29..1afb65eee6c9f 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -65,149 +65,24 @@ static void rk_crypto_disable_clk(struct rk_crypto_info *dev) clk_disable_unprepare(dev->sclk); } -static int check_alignment(struct scatterlist *sg_src, - struct scatterlist *sg_dst, - int align_mask) -{ - int in, out, align; - - in = IS_ALIGNED((uint32_t)sg_src->offset, 4) && - IS_ALIGNED((uint32_t)sg_src->length, align_mask); - if (!sg_dst) - return in; - out = IS_ALIGNED((uint32_t)sg_dst->offset, 4) && - IS_ALIGNED((uint32_t)sg_dst->length, align_mask); - align = in && out; - - return (align && (sg_src->length == sg_dst->length)); -} - -static int rk_load_data(struct rk_crypto_info *dev, - struct scatterlist *sg_src, - struct scatterlist *sg_dst) -{ - unsigned int count; - - count = min(dev->left_bytes, sg_src->length); - dev->left_bytes -= count; - - if (!dma_map_sg(dev->dev, sg_src, 1, DMA_TO_DEVICE)) { - dev_err(dev->dev, "[%s:%d] dma_map_sg(src) error\n", - __func__, __LINE__); - return -EINVAL; - } - dev->addr_in = sg_dma_address(sg_src); - - if (sg_dst) { - if (!dma_map_sg(dev->dev, sg_dst, 1, DMA_FROM_DEVICE)) { - dev_err(dev->dev, - "[%s:%d] dma_map_sg(dst) error\n", - __func__, __LINE__); - dma_unmap_sg(dev->dev, sg_src, 1, - DMA_TO_DEVICE); - return -EINVAL; - } - dev->addr_out = sg_dma_address(sg_dst); - } - dev->count = count; - return 0; -} - -static void rk_unload_data(struct rk_crypto_info *dev) -{ - struct scatterlist *sg_in, *sg_out; - - sg_in = dev->sg_src; - dma_unmap_sg(dev->dev, sg_in, 1, DMA_TO_DEVICE); - - if (dev->sg_dst) { - sg_out = dev->sg_dst; - dma_unmap_sg(dev->dev, sg_out, 1, DMA_FROM_DEVICE); - } -} - static irqreturn_t rk_crypto_irq_handle(int irq, void *dev_id) { struct rk_crypto_info *dev = platform_get_drvdata(dev_id); u32 interrupt_status; - spin_lock(&dev->lock); interrupt_status = CRYPTO_READ(dev, RK_CRYPTO_INTSTS); CRYPTO_WRITE(dev, RK_CRYPTO_INTSTS, interrupt_status); + dev->status = 1; if (interrupt_status & 0x0a) { dev_warn(dev->dev, "DMA Error\n"); - dev->err = -EFAULT; + dev->status = 0; } - tasklet_schedule(&dev->done_task); + complete(&dev->complete); - spin_unlock(&dev->lock); return IRQ_HANDLED; } -static int rk_crypto_enqueue(struct rk_crypto_info *dev, - struct crypto_async_request *async_req) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&dev->lock, flags); - ret = crypto_enqueue_request(&dev->queue, async_req); - if (dev->busy) { - spin_unlock_irqrestore(&dev->lock, flags); - return ret; - } - dev->busy = true; - spin_unlock_irqrestore(&dev->lock, flags); - tasklet_schedule(&dev->queue_task); - - return ret; -} - -static void rk_crypto_queue_task_cb(unsigned long data) -{ - struct rk_crypto_info *dev = (struct rk_crypto_info *)data; - struct crypto_async_request *async_req, *backlog; - unsigned long flags; - int err = 0; - - dev->err = 0; - spin_lock_irqsave(&dev->lock, flags); - backlog = crypto_get_backlog(&dev->queue); - async_req = crypto_dequeue_request(&dev->queue); - - if (!async_req) { - dev->busy = false; - spin_unlock_irqrestore(&dev->lock, flags); - return; - } - spin_unlock_irqrestore(&dev->lock, flags); - - if (backlog) { - backlog->complete(backlog, -EINPROGRESS); - backlog = NULL; - } - - dev->async_req = async_req; - err = dev->start(dev); - if (err) - dev->complete(dev->async_req, err); -} - -static void rk_crypto_done_task_cb(unsigned long data) -{ - struct rk_crypto_info *dev = (struct rk_crypto_info *)data; - - if (dev->err) { - dev->complete(dev->async_req, dev->err); - return; - } - - dev->err = dev->update(dev); - if (dev->err) - dev->complete(dev->async_req, dev->err); -} - static struct rk_crypto_tmp *rk_cipher_algs[] = { &rk_ecb_aes_alg, &rk_cbc_aes_alg, @@ -300,8 +175,6 @@ static int rk_crypto_probe(struct platform_device *pdev) if (err) goto err_crypto; - spin_lock_init(&crypto_info->lock); - crypto_info->reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(crypto_info->reg)) { err = PTR_ERR(crypto_info->reg); @@ -351,17 +224,11 @@ static int rk_crypto_probe(struct platform_device *pdev) crypto_info->dev = &pdev->dev; platform_set_drvdata(pdev, crypto_info); - tasklet_init(&crypto_info->queue_task, - rk_crypto_queue_task_cb, (unsigned long)crypto_info); - tasklet_init(&crypto_info->done_task, - rk_crypto_done_task_cb, (unsigned long)crypto_info); - crypto_init_queue(&crypto_info->queue, 50); + crypto_info->engine = crypto_engine_alloc_init(&pdev->dev, true); + crypto_engine_start(crypto_info->engine); + init_completion(&crypto_info->complete); rk_crypto_enable_clk(crypto_info); - crypto_info->load_data = rk_load_data; - crypto_info->unload_data = rk_unload_data; - crypto_info->enqueue = rk_crypto_enqueue; - crypto_info->busy = false; err = rk_crypto_register(crypto_info); if (err) { @@ -373,9 +240,9 @@ static int rk_crypto_probe(struct platform_device *pdev) return 0; err_register_alg: - tasklet_kill(&crypto_info->queue_task); - tasklet_kill(&crypto_info->done_task); + crypto_engine_exit(crypto_info->engine); err_crypto: + dev_err(dev, "Crypto Accelerator not successfully registered\n"); return err; } @@ -385,8 +252,7 @@ static int rk_crypto_remove(struct platform_device *pdev) rk_crypto_unregister(); rk_crypto_disable_clk(crypto_tmp); - tasklet_kill(&crypto_tmp->done_task); - tasklet_kill(&crypto_tmp->queue_task); + crypto_engine_exit(crypto_tmp->engine); return 0; } diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index a7de5738f6dc4..65ed645e01687 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -193,39 +195,15 @@ struct rk_crypto_info { struct reset_control *rst; void __iomem *reg; int irq; - struct crypto_queue queue; - struct tasklet_struct queue_task; - struct tasklet_struct done_task; - struct crypto_async_request *async_req; - int err; - /* device lock */ - spinlock_t lock; - - /* the public variable */ - struct scatterlist *sg_src; - struct scatterlist *sg_dst; - struct scatterlist *first; - unsigned int left_bytes; - size_t src_nents; - size_t dst_nents; - unsigned int total; - unsigned int count; - dma_addr_t addr_in; - dma_addr_t addr_out; - bool busy; - int (*start)(struct rk_crypto_info *dev); - int (*update)(struct rk_crypto_info *dev); - void (*complete)(struct crypto_async_request *base, int err); - int (*load_data)(struct rk_crypto_info *dev, - struct scatterlist *sg_src, - struct scatterlist *sg_dst); - void (*unload_data)(struct rk_crypto_info *dev); - int (*enqueue)(struct rk_crypto_info *dev, - struct crypto_async_request *async_req); + + struct crypto_engine *engine; + struct completion complete; + int status; }; /* the private variable of hash */ struct rk_ahash_ctx { + struct crypto_engine_ctx enginectx; struct rk_crypto_info *dev; /* for fallback */ struct crypto_ahash *fallback_tfm; @@ -235,10 +213,12 @@ struct rk_ahash_ctx { struct rk_ahash_rctx { struct ahash_request fallback_req; u32 mode; + int nrsg; }; /* the private variable of cipher */ struct rk_cipher_ctx { + struct crypto_engine_ctx enginectx; struct rk_crypto_info *dev; unsigned int keylen; u8 key[AES_MAX_KEY_SIZE]; @@ -247,6 +227,7 @@ struct rk_cipher_ctx { }; struct rk_cipher_rctx { + u8 backup_iv[AES_BLOCK_SIZE]; u32 mode; struct skcipher_request fallback_req; // keep at the end }; diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index c762e462eb570..edd40e16a3f0a 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -9,6 +9,7 @@ * Some ideas are from marvell/cesa.c and s5p-sss.c driver. */ #include +#include #include "rk3288_crypto.h" /* @@ -72,16 +73,12 @@ static int zero_message_process(struct ahash_request *req) return 0; } -static void rk_ahash_crypto_complete(struct crypto_async_request *base, int err) +static void rk_ahash_reg_init(struct ahash_request *req) { - if (base->complete) - base->complete(base, err); -} - -static void rk_ahash_reg_init(struct rk_crypto_info *dev) -{ - struct ahash_request *req = ahash_request_cast(dev->async_req); struct rk_ahash_rctx *rctx = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + struct rk_crypto_info *dev = tctx->dev; int reg_status; reg_status = CRYPTO_READ(dev, RK_CRYPTO_CTRL) | @@ -108,7 +105,7 @@ static void rk_ahash_reg_init(struct rk_crypto_info *dev) RK_CRYPTO_BYTESWAP_BRFIFO | RK_CRYPTO_BYTESWAP_BTFIFO); - CRYPTO_WRITE(dev, RK_CRYPTO_HASH_MSG_LEN, dev->total); + CRYPTO_WRITE(dev, RK_CRYPTO_HASH_MSG_LEN, req->nbytes); } static int rk_ahash_init(struct ahash_request *req) @@ -206,44 +203,59 @@ static int rk_ahash_digest(struct ahash_request *req) if (!req->nbytes) return zero_message_process(req); - else - return dev->enqueue(dev, &req->base); + + return crypto_transfer_hash_request_to_engine(dev->engine, req); } -static void crypto_ahash_dma_start(struct rk_crypto_info *dev) +static void crypto_ahash_dma_start(struct rk_crypto_info *dev, struct scatterlist *sg) { - CRYPTO_WRITE(dev, RK_CRYPTO_HRDMAS, dev->addr_in); - CRYPTO_WRITE(dev, RK_CRYPTO_HRDMAL, (dev->count + 3) / 4); + CRYPTO_WRITE(dev, RK_CRYPTO_HRDMAS, sg_dma_address(sg)); + CRYPTO_WRITE(dev, RK_CRYPTO_HRDMAL, sg_dma_len(sg) / 4); CRYPTO_WRITE(dev, RK_CRYPTO_CTRL, RK_CRYPTO_HASH_START | (RK_CRYPTO_HASH_START << 16)); } -static int rk_ahash_set_data_start(struct rk_crypto_info *dev) +static int rk_hash_prepare(struct crypto_engine *engine, void *breq) +{ + struct ahash_request *areq = container_of(breq, struct ahash_request, base); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); + struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); + struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + int ret; + + ret = dma_map_sg(tctx->dev->dev, areq->src, sg_nents(areq->src), DMA_TO_DEVICE); + if (ret <= 0) + return -EINVAL; + + rctx->nrsg = ret; + + return 0; +} + +static int rk_hash_unprepare(struct crypto_engine *engine, void *breq) { - int err; + struct ahash_request *areq = container_of(breq, struct ahash_request, base); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); + struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); + struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); - err = dev->load_data(dev, dev->sg_src, NULL); - if (!err) - crypto_ahash_dma_start(dev); - return err; + dma_unmap_sg(tctx->dev->dev, areq->src, rctx->nrsg, DMA_TO_DEVICE); + return 0; } -static int rk_ahash_start(struct rk_crypto_info *dev) +static int rk_hash_run(struct crypto_engine *engine, void *breq) { - struct ahash_request *req = ahash_request_cast(dev->async_req); - struct crypto_ahash *tfm; - struct rk_ahash_rctx *rctx; - - dev->total = req->nbytes; - dev->left_bytes = req->nbytes; - dev->sg_dst = NULL; - dev->sg_src = req->src; - dev->first = req->src; - dev->src_nents = sg_nents(req->src); - rctx = ahash_request_ctx(req); + struct ahash_request *areq = container_of(breq, struct ahash_request, base); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); + struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); + struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + struct scatterlist *sg = areq->src; + int err = 0; + int i; + u32 v; + rctx->mode = 0; - tfm = crypto_ahash_reqtfm(req); switch (crypto_ahash_digestsize(tfm)) { case SHA1_DIGEST_SIZE: rctx->mode = RK_CRYPTO_HASH_SHA1; @@ -255,30 +267,26 @@ static int rk_ahash_start(struct rk_crypto_info *dev) rctx->mode = RK_CRYPTO_HASH_MD5; break; default: - return -EINVAL; + err = -EINVAL; + goto theend; } - rk_ahash_reg_init(dev); - return rk_ahash_set_data_start(dev); -} + rk_ahash_reg_init(areq); -static int rk_ahash_crypto_rx(struct rk_crypto_info *dev) -{ - int err = 0; - struct ahash_request *req = ahash_request_cast(dev->async_req); - struct crypto_ahash *tfm; - - dev->unload_data(dev); - if (dev->left_bytes) { - if (sg_is_last(dev->sg_src)) { - dev_warn(dev->dev, "[%s:%d], Lack of data\n", - __func__, __LINE__); - err = -ENOMEM; - goto out_rx; + while (sg) { + reinit_completion(&tctx->dev->complete); + tctx->dev->status = 0; + crypto_ahash_dma_start(tctx->dev, sg); + wait_for_completion_interruptible_timeout(&tctx->dev->complete, + msecs_to_jiffies(2000)); + if (!tctx->dev->status) { + dev_err(tctx->dev->dev, "DMA timeout\n"); + err = -EFAULT; + goto theend; } - dev->sg_src = sg_next(dev->sg_src); - err = rk_ahash_set_data_start(dev); - } else { + sg = sg_next(sg); + } + /* * it will take some time to process date after last dma * transmission. @@ -289,18 +297,20 @@ static int rk_ahash_crypto_rx(struct rk_crypto_info *dev) * efficiency, and make it response quickly when dma * complete. */ - while (!CRYPTO_READ(dev, RK_CRYPTO_HASH_STS)) - udelay(10); - - tfm = crypto_ahash_reqtfm(req); - memcpy_fromio(req->result, dev->reg + RK_CRYPTO_HASH_DOUT_0, - crypto_ahash_digestsize(tfm)); - dev->complete(dev->async_req, 0); - tasklet_schedule(&dev->queue_task); + while (!CRYPTO_READ(tctx->dev, RK_CRYPTO_HASH_STS)) + udelay(10); + + for (i = 0; i < crypto_ahash_digestsize(tfm) / 4; i++) { + v = readl(tctx->dev->reg + RK_CRYPTO_HASH_DOUT_0 + i * 4); + put_unaligned_le32(v, areq->result + i * 4); } -out_rx: - return err; +theend: + local_bh_disable(); + crypto_finalize_hash_request(engine, breq, err); + local_bh_enable(); + + return 0; } static int rk_cra_hash_init(struct crypto_tfm *tfm) @@ -314,9 +324,6 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) algt = container_of(alg, struct rk_crypto_tmp, alg.hash); tctx->dev = algt->dev; - tctx->dev->start = rk_ahash_start; - tctx->dev->update = rk_ahash_crypto_rx; - tctx->dev->complete = rk_ahash_crypto_complete; /* for fallback */ tctx->fallback_tfm = crypto_alloc_ahash(alg_name, 0, @@ -325,10 +332,15 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) dev_err(tctx->dev->dev, "Could not load fallback driver.\n"); return PTR_ERR(tctx->fallback_tfm); } + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), sizeof(struct rk_ahash_rctx) + crypto_ahash_reqsize(tctx->fallback_tfm)); + tctx->enginectx.op.do_one_request = rk_hash_run; + tctx->enginectx.op.prepare_request = rk_hash_prepare; + tctx->enginectx.op.unprepare_request = rk_hash_unprepare; + return 0; } diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index d067b7f091659..67a7e05d5ae31 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -9,6 +9,7 @@ * Some ideas are from marvell-cesa.c and s5p-sss.c driver. */ #include +#include #include "rk3288_crypto.h" #define RK_CRYPTO_DEC BIT(0) @@ -70,19 +71,15 @@ static int rk_cipher_fallback(struct skcipher_request *areq) return err; } -static void rk_crypto_complete(struct crypto_async_request *base, int err) -{ - if (base->complete) - base->complete(base, err); -} - static int rk_handle_req(struct rk_crypto_info *dev, struct skcipher_request *req) { + struct crypto_engine *engine = dev->engine; + if (rk_cipher_need_fallback(req)) return rk_cipher_fallback(req); - return dev->enqueue(dev, &req->base); + return crypto_transfer_skcipher_request_to_engine(engine, req); } static int rk_aes_setkey(struct crypto_skcipher *cipher, @@ -265,25 +262,21 @@ static int rk_des3_ede_cbc_decrypt(struct skcipher_request *req) return rk_handle_req(dev, req); } -static void rk_ablk_hw_init(struct rk_crypto_info *dev) +static void rk_ablk_hw_init(struct rk_crypto_info *dev, struct skcipher_request *req) { - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(req); struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(cipher); - u32 ivsize, block, conf_reg = 0; + u32 block, conf_reg = 0; block = crypto_tfm_alg_blocksize(tfm); - ivsize = crypto_skcipher_ivsize(cipher); if (block == DES_BLOCK_SIZE) { rctx->mode |= RK_CRYPTO_TDES_FIFO_MODE | RK_CRYPTO_TDES_BYTESWAP_KEY | RK_CRYPTO_TDES_BYTESWAP_IV; CRYPTO_WRITE(dev, RK_CRYPTO_TDES_CTRL, rctx->mode); - memcpy_toio(dev->reg + RK_CRYPTO_TDES_IV_0, req->iv, ivsize); memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, ctx->key, ctx->keylen); conf_reg = RK_CRYPTO_DESSEL; } else { @@ -296,7 +289,6 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) else if (ctx->keylen == AES_KEYSIZE_256) rctx->mode |= RK_CRYPTO_AES_256BIT_key; CRYPTO_WRITE(dev, RK_CRYPTO_AES_CTRL, rctx->mode); - memcpy_toio(dev->reg + RK_CRYPTO_AES_IV_0, req->iv, ivsize); memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, ctx->key, ctx->keylen); } conf_reg |= RK_CRYPTO_BYTESWAP_BTFIFO | @@ -306,133 +298,138 @@ static void rk_ablk_hw_init(struct rk_crypto_info *dev) RK_CRYPTO_BCDMA_ERR_ENA | RK_CRYPTO_BCDMA_DONE_ENA); } -static void crypto_dma_start(struct rk_crypto_info *dev) +static void crypto_dma_start(struct rk_crypto_info *dev, + struct scatterlist *sgs, + struct scatterlist *sgd, unsigned int todo) { - CRYPTO_WRITE(dev, RK_CRYPTO_BRDMAS, dev->addr_in); - CRYPTO_WRITE(dev, RK_CRYPTO_BRDMAL, dev->count / 4); - CRYPTO_WRITE(dev, RK_CRYPTO_BTDMAS, dev->addr_out); + CRYPTO_WRITE(dev, RK_CRYPTO_BRDMAS, sg_dma_address(sgs)); + CRYPTO_WRITE(dev, RK_CRYPTO_BRDMAL, todo); + CRYPTO_WRITE(dev, RK_CRYPTO_BTDMAS, sg_dma_address(sgd)); CRYPTO_WRITE(dev, RK_CRYPTO_CTRL, RK_CRYPTO_BLOCK_START | _SBF(RK_CRYPTO_BLOCK_START, 16)); } -static int rk_set_data_start(struct rk_crypto_info *dev) +static int rk_cipher_run(struct crypto_engine *engine, void *async_req) { - int err; - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); + struct skcipher_request *areq = container_of(async_req, struct skcipher_request, base); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(areq); struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 ivsize = crypto_skcipher_ivsize(tfm); - u8 *src_last_blk = page_address(sg_page(dev->sg_src)) + - dev->sg_src->offset + dev->sg_src->length - ivsize; - - /* Store the iv that need to be updated in chain mode. - * And update the IV buffer to contain the next IV for decryption mode. - */ - if (rctx->mode & RK_CRYPTO_DEC) { - memcpy(ctx->iv, src_last_blk, ivsize); - sg_pcopy_to_buffer(dev->first, dev->src_nents, req->iv, - ivsize, dev->total - ivsize); - } - - err = dev->load_data(dev, dev->sg_src, dev->sg_dst); - if (!err) - crypto_dma_start(dev); - return err; -} - -static int rk_ablk_start(struct rk_crypto_info *dev) -{ - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); - unsigned long flags; + struct rk_cipher_rctx *rctx = skcipher_request_ctx(areq); + struct scatterlist *sgs, *sgd; int err = 0; + int ivsize = crypto_skcipher_ivsize(tfm); + int offset; + u8 iv[AES_BLOCK_SIZE]; + u8 biv[AES_BLOCK_SIZE]; + u8 *ivtouse = areq->iv; + unsigned int len = areq->cryptlen; + unsigned int todo; + + ivsize = crypto_skcipher_ivsize(tfm); + if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { + if (rctx->mode & RK_CRYPTO_DEC) { + offset = areq->cryptlen - ivsize; + scatterwalk_map_and_copy(rctx->backup_iv, areq->src, + offset, ivsize, 0); + } + } - dev->left_bytes = req->cryptlen; - dev->total = req->cryptlen; - dev->sg_src = req->src; - dev->first = req->src; - dev->src_nents = sg_nents(req->src); - dev->sg_dst = req->dst; - dev->dst_nents = sg_nents(req->dst); - - spin_lock_irqsave(&dev->lock, flags); - rk_ablk_hw_init(dev); - err = rk_set_data_start(dev); - spin_unlock_irqrestore(&dev->lock, flags); - return err; -} - -static void rk_iv_copyback(struct rk_crypto_info *dev) -{ - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 ivsize = crypto_skcipher_ivsize(tfm); + sgs = areq->src; + sgd = areq->dst; - /* Update the IV buffer to contain the next IV for encryption mode. */ - if (!(rctx->mode & RK_CRYPTO_DEC)) { - memcpy(req->iv, - sg_virt(dev->sg_dst) + dev->sg_dst->length - ivsize, - ivsize); + while (sgs && sgd && len) { + if (!sgs->length) { + sgs = sg_next(sgs); + sgd = sg_next(sgd); + continue; + } + if (rctx->mode & RK_CRYPTO_DEC) { + /* we backup last block of source to be used as IV at next step */ + offset = sgs->length - ivsize; + scatterwalk_map_and_copy(biv, sgs, offset, ivsize, 0); + } + if (sgs == sgd) { + err = dma_map_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); + if (err <= 0) { + err = -EINVAL; + goto theend_iv; + } + } else { + err = dma_map_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); + if (err <= 0) { + err = -EINVAL; + goto theend_iv; + } + err = dma_map_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); + if (err <= 0) { + err = -EINVAL; + goto theend_sgs; + } + } + err = 0; + rk_ablk_hw_init(ctx->dev, areq); + if (ivsize) { + if (ivsize == DES_BLOCK_SIZE) + memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_IV_0, ivtouse, ivsize); + else + memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_IV_0, ivtouse, ivsize); + } + reinit_completion(&ctx->dev->complete); + ctx->dev->status = 0; + + todo = min(sg_dma_len(sgs), len); + len -= todo; + crypto_dma_start(ctx->dev, sgs, sgd, todo / 4); + wait_for_completion_interruptible_timeout(&ctx->dev->complete, + msecs_to_jiffies(2000)); + if (!ctx->dev->status) { + dev_err(ctx->dev->dev, "DMA timeout\n"); + err = -EFAULT; + goto theend; + } + if (sgs == sgd) { + dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); + } else { + dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); + dma_unmap_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); + } + if (rctx->mode & RK_CRYPTO_DEC) { + memcpy(iv, biv, ivsize); + ivtouse = iv; + } else { + offset = sgd->length - ivsize; + scatterwalk_map_and_copy(iv, sgd, offset, ivsize, 0); + ivtouse = iv; + } + sgs = sg_next(sgs); + sgd = sg_next(sgd); } -} - -static void rk_update_iv(struct rk_crypto_info *dev) -{ - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 ivsize = crypto_skcipher_ivsize(tfm); - u8 *new_iv = NULL; - if (rctx->mode & RK_CRYPTO_DEC) { - new_iv = ctx->iv; - } else { - new_iv = page_address(sg_page(dev->sg_dst)) + - dev->sg_dst->offset + dev->sg_dst->length - ivsize; + if (areq->iv && ivsize > 0) { + offset = areq->cryptlen - ivsize; + if (rctx->mode & RK_CRYPTO_DEC) { + memcpy(areq->iv, rctx->backup_iv, ivsize); + memzero_explicit(rctx->backup_iv, ivsize); + } else { + scatterwalk_map_and_copy(areq->iv, areq->dst, offset, + ivsize, 0); + } } - if (ivsize == DES_BLOCK_SIZE) - memcpy_toio(dev->reg + RK_CRYPTO_TDES_IV_0, new_iv, ivsize); - else if (ivsize == AES_BLOCK_SIZE) - memcpy_toio(dev->reg + RK_CRYPTO_AES_IV_0, new_iv, ivsize); -} +theend: + local_bh_disable(); + crypto_finalize_skcipher_request(engine, areq, err); + local_bh_enable(); + return 0; -/* return: - * true some err was occurred - * fault no err, continue - */ -static int rk_ablk_rx(struct rk_crypto_info *dev) -{ - int err = 0; - struct skcipher_request *req = - skcipher_request_cast(dev->async_req); - - dev->unload_data(dev); - if (dev->left_bytes) { - rk_update_iv(dev); - if (sg_is_last(dev->sg_src)) { - dev_err(dev->dev, "[%s:%d] Lack of data\n", - __func__, __LINE__); - err = -ENOMEM; - goto out_rx; - } - dev->sg_src = sg_next(dev->sg_src); - dev->sg_dst = sg_next(dev->sg_dst); - err = rk_set_data_start(dev); +theend_sgs: + if (sgs == sgd) { + dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); } else { - rk_iv_copyback(dev); - /* here show the calculation is over without any err */ - dev->complete(dev->async_req, 0); - tasklet_schedule(&dev->queue_task); + dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); + dma_unmap_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); } -out_rx: +theend_iv: return err; } @@ -446,9 +443,6 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); ctx->dev = algt->dev; - ctx->dev->start = rk_ablk_start; - ctx->dev->update = rk_ablk_rx; - ctx->dev->complete = rk_crypto_complete; ctx->fallback_tfm = crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(ctx->fallback_tfm)) { @@ -460,6 +454,8 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) tfm->reqsize = sizeof(struct rk_cipher_rctx) + crypto_skcipher_reqsize(ctx->fallback_tfm); + ctx->enginectx.op.do_one_request = rk_cipher_run; + return 0; } -- GitLab From 6d55c4a206d29006c733b5083ba5da8391abbdbd Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:49 +0000 Subject: [PATCH 0242/1423] crypto: rockchip - rewrite type Instead of using a custom type for classify algorithms, let's just use already defined ones. And let's made a bit more verbose about what is registered. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 26 +++++++++++++------ drivers/crypto/rockchip/rk3288_crypto.h | 7 +---- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 6 ++--- .../crypto/rockchip/rk3288_crypto_skcipher.c | 12 ++++----- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 1afb65eee6c9f..8f9664acc78d3 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -102,12 +102,22 @@ static int rk_crypto_register(struct rk_crypto_info *crypto_info) for (i = 0; i < ARRAY_SIZE(rk_cipher_algs); i++) { rk_cipher_algs[i]->dev = crypto_info; - if (rk_cipher_algs[i]->type == ALG_TYPE_CIPHER) - err = crypto_register_skcipher( - &rk_cipher_algs[i]->alg.skcipher); - else - err = crypto_register_ahash( - &rk_cipher_algs[i]->alg.hash); + switch (rk_cipher_algs[i]->type) { + case CRYPTO_ALG_TYPE_SKCIPHER: + dev_info(crypto_info->dev, "Register %s as %s\n", + rk_cipher_algs[i]->alg.skcipher.base.cra_name, + rk_cipher_algs[i]->alg.skcipher.base.cra_driver_name); + err = crypto_register_skcipher(&rk_cipher_algs[i]->alg.skcipher); + break; + case CRYPTO_ALG_TYPE_AHASH: + dev_info(crypto_info->dev, "Register %s as %s\n", + rk_cipher_algs[i]->alg.hash.halg.base.cra_name, + rk_cipher_algs[i]->alg.hash.halg.base.cra_driver_name); + err = crypto_register_ahash(&rk_cipher_algs[i]->alg.hash); + break; + default: + dev_err(crypto_info->dev, "unknown algorithm\n"); + } if (err) goto err_cipher_algs; } @@ -115,7 +125,7 @@ static int rk_crypto_register(struct rk_crypto_info *crypto_info) err_cipher_algs: for (k = 0; k < i; k++) { - if (rk_cipher_algs[i]->type == ALG_TYPE_CIPHER) + if (rk_cipher_algs[i]->type == CRYPTO_ALG_TYPE_SKCIPHER) crypto_unregister_skcipher(&rk_cipher_algs[k]->alg.skcipher); else crypto_unregister_ahash(&rk_cipher_algs[i]->alg.hash); @@ -128,7 +138,7 @@ static void rk_crypto_unregister(void) unsigned int i; for (i = 0; i < ARRAY_SIZE(rk_cipher_algs); i++) { - if (rk_cipher_algs[i]->type == ALG_TYPE_CIPHER) + if (rk_cipher_algs[i]->type == CRYPTO_ALG_TYPE_SKCIPHER) crypto_unregister_skcipher(&rk_cipher_algs[i]->alg.skcipher); else crypto_unregister_ahash(&rk_cipher_algs[i]->alg.hash); diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 65ed645e01687..d924ea17402a7 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -232,18 +232,13 @@ struct rk_cipher_rctx { struct skcipher_request fallback_req; // keep at the end }; -enum alg_type { - ALG_TYPE_HASH, - ALG_TYPE_CIPHER, -}; - struct rk_crypto_tmp { + u32 type; struct rk_crypto_info *dev; union { struct skcipher_alg skcipher; struct ahash_alg hash; } alg; - enum alg_type type; }; extern struct rk_crypto_tmp rk_ecb_aes_alg; diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index edd40e16a3f0a..d08e2438d3567 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -352,7 +352,7 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) } struct rk_crypto_tmp rk_ahash_sha1 = { - .type = ALG_TYPE_HASH, + .type = CRYPTO_ALG_TYPE_AHASH, .alg.hash = { .init = rk_ahash_init, .update = rk_ahash_update, @@ -382,7 +382,7 @@ struct rk_crypto_tmp rk_ahash_sha1 = { }; struct rk_crypto_tmp rk_ahash_sha256 = { - .type = ALG_TYPE_HASH, + .type = CRYPTO_ALG_TYPE_AHASH, .alg.hash = { .init = rk_ahash_init, .update = rk_ahash_update, @@ -412,7 +412,7 @@ struct rk_crypto_tmp rk_ahash_sha256 = { }; struct rk_crypto_tmp rk_ahash_md5 = { - .type = ALG_TYPE_HASH, + .type = CRYPTO_ALG_TYPE_AHASH, .alg.hash = { .init = rk_ahash_init, .update = rk_ahash_update, diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 67a7e05d5ae31..1ed297f5d8098 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -468,7 +468,7 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) } struct rk_crypto_tmp rk_ecb_aes_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "ecb(aes)", .base.cra_driver_name = "ecb-aes-rk", @@ -490,7 +490,7 @@ struct rk_crypto_tmp rk_ecb_aes_alg = { }; struct rk_crypto_tmp rk_cbc_aes_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "cbc(aes)", .base.cra_driver_name = "cbc-aes-rk", @@ -513,7 +513,7 @@ struct rk_crypto_tmp rk_cbc_aes_alg = { }; struct rk_crypto_tmp rk_ecb_des_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "ecb(des)", .base.cra_driver_name = "ecb-des-rk", @@ -535,7 +535,7 @@ struct rk_crypto_tmp rk_ecb_des_alg = { }; struct rk_crypto_tmp rk_cbc_des_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "cbc(des)", .base.cra_driver_name = "cbc-des-rk", @@ -558,7 +558,7 @@ struct rk_crypto_tmp rk_cbc_des_alg = { }; struct rk_crypto_tmp rk_ecb_des3_ede_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "ecb-des3-ede-rk", @@ -580,7 +580,7 @@ struct rk_crypto_tmp rk_ecb_des3_ede_alg = { }; struct rk_crypto_tmp rk_cbc_des3_ede_alg = { - .type = ALG_TYPE_CIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .alg.skcipher = { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "cbc-des3-ede-rk", -- GitLab From 48d904d428b68080abd9161148ca2ab1331124a4 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:50 +0000 Subject: [PATCH 0243/1423] crypto: rockchip - add debugfs This patch enable to access usage stats for each algorithm. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 10 ++++ drivers/crypto/rockchip/rk3288_crypto.c | 47 +++++++++++++++++++ drivers/crypto/rockchip/rk3288_crypto.h | 11 +++++ drivers/crypto/rockchip/rk3288_crypto_ahash.c | 8 ++++ .../crypto/rockchip/rk3288_crypto_skcipher.c | 15 ++++++ 5 files changed, 91 insertions(+) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index c30b5a39c2ac2..2947888d3b828 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -686,6 +686,16 @@ config CRYPTO_DEV_ROCKCHIP This driver interfaces with the hardware crypto accelerator. Supporting cbc/ecb chainmode, and aes/des/des3_ede cipher mode. +config CRYPTO_DEV_ROCKCHIP_DEBUG + bool "Enable Rockchip crypto stats" + depends on CRYPTO_DEV_ROCKCHIP + depends on DEBUG_FS + help + Say y to enable Rockchip crypto debug stats. + This will create /sys/kernel/debug/rk3288_crypto/stats for displaying + the number of requests per algorithm and other internal stats. + + config CRYPTO_DEV_ZYNQMP_AES tristate "Support for Xilinx ZynqMP AES hw accelerator" depends on ZYNQMP_FIRMWARE || COMPILE_TEST diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 8f9664acc78d3..3e1b4f3b2422e 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -95,6 +95,41 @@ static struct rk_crypto_tmp *rk_cipher_algs[] = { &rk_ahash_md5, }; +#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG +static int rk_crypto_debugfs_show(struct seq_file *seq, void *v) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rk_cipher_algs); i++) { + if (!rk_cipher_algs[i]->dev) + continue; + switch (rk_cipher_algs[i]->type) { + case CRYPTO_ALG_TYPE_SKCIPHER: + seq_printf(seq, "%s %s reqs=%lu fallback=%lu\n", + rk_cipher_algs[i]->alg.skcipher.base.cra_driver_name, + rk_cipher_algs[i]->alg.skcipher.base.cra_name, + rk_cipher_algs[i]->stat_req, rk_cipher_algs[i]->stat_fb); + seq_printf(seq, "\tfallback due to length: %lu\n", + rk_cipher_algs[i]->stat_fb_len); + seq_printf(seq, "\tfallback due to alignment: %lu\n", + rk_cipher_algs[i]->stat_fb_align); + seq_printf(seq, "\tfallback due to SGs: %lu\n", + rk_cipher_algs[i]->stat_fb_sgdiff); + break; + case CRYPTO_ALG_TYPE_AHASH: + seq_printf(seq, "%s %s reqs=%lu fallback=%lu\n", + rk_cipher_algs[i]->alg.hash.halg.base.cra_driver_name, + rk_cipher_algs[i]->alg.hash.halg.base.cra_name, + rk_cipher_algs[i]->stat_req, rk_cipher_algs[i]->stat_fb); + break; + } + } + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(rk_crypto_debugfs); +#endif + static int rk_crypto_register(struct rk_crypto_info *crypto_info) { unsigned int i, k; @@ -246,6 +281,15 @@ static int rk_crypto_probe(struct platform_device *pdev) goto err_register_alg; } +#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG + /* Ignore error of debugfs */ + crypto_info->dbgfs_dir = debugfs_create_dir("rk3288_crypto", NULL); + crypto_info->dbgfs_stats = debugfs_create_file("stats", 0444, + crypto_info->dbgfs_dir, + crypto_info, + &rk_crypto_debugfs_fops); +#endif + dev_info(dev, "Crypto Accelerator successfully registered\n"); return 0; @@ -260,6 +304,9 @@ static int rk_crypto_remove(struct platform_device *pdev) { struct rk_crypto_info *crypto_tmp = platform_get_drvdata(pdev); +#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG + debugfs_remove_recursive(crypto_tmp->dbgfs_dir); +#endif rk_crypto_unregister(); rk_crypto_disable_clk(crypto_tmp); crypto_engine_exit(crypto_tmp->engine); diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index d924ea17402a7..945a8184bbadd 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -199,6 +200,10 @@ struct rk_crypto_info { struct crypto_engine *engine; struct completion complete; int status; +#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG + struct dentry *dbgfs_dir; + struct dentry *dbgfs_stats; +#endif }; /* the private variable of hash */ @@ -239,6 +244,12 @@ struct rk_crypto_tmp { struct skcipher_alg skcipher; struct ahash_alg hash; } alg; + unsigned long stat_req; + unsigned long stat_fb; + unsigned long stat_fb_len; + unsigned long stat_fb_sglen; + unsigned long stat_fb_align; + unsigned long stat_fb_sgdiff; }; extern struct rk_crypto_tmp rk_ecb_aes_alg; diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index d08e2438d3567..8856c6226be60 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -39,6 +39,10 @@ static int rk_ahash_digest_fb(struct ahash_request *areq) struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_ctx *tfmctx = crypto_ahash_ctx(tfm); + struct ahash_alg *alg = __crypto_ahash_alg(tfm->base.__crt_alg); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.hash); + + algt->stat_fb++; ahash_request_set_tfm(&rctx->fallback_req, tfmctx->fallback_tfm); rctx->fallback_req.base.flags = areq->base.flags & @@ -249,6 +253,8 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + struct ahash_alg *alg = __crypto_ahash_alg(tfm->base.__crt_alg); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.hash); struct scatterlist *sg = areq->src; int err = 0; int i; @@ -256,6 +262,8 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) rctx->mode = 0; + algt->stat_req++; + switch (crypto_ahash_digestsize(tfm)) { case SHA1_DIGEST_SIZE: rctx->mode = RK_CRYPTO_HASH_SHA1; diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 1ed297f5d8098..91b8a4c574da2 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -18,6 +18,8 @@ static int rk_cipher_need_fallback(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); unsigned int bs = crypto_skcipher_blocksize(tfm); + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); struct scatterlist *sgs, *sgd; unsigned int stodo, dtodo, len; @@ -29,20 +31,25 @@ static int rk_cipher_need_fallback(struct skcipher_request *req) sgd = req->dst; while (sgs && sgd) { if (!IS_ALIGNED(sgs->offset, sizeof(u32))) { + algt->stat_fb_align++; return true; } if (!IS_ALIGNED(sgd->offset, sizeof(u32))) { + algt->stat_fb_align++; return true; } stodo = min(len, sgs->length); if (stodo % bs) { + algt->stat_fb_len++; return true; } dtodo = min(len, sgd->length); if (dtodo % bs) { + algt->stat_fb_len++; return true; } if (stodo != dtodo) { + algt->stat_fb_sgdiff++; return true; } len -= stodo; @@ -57,8 +64,12 @@ static int rk_cipher_fallback(struct skcipher_request *areq) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(areq); struct rk_cipher_ctx *op = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(areq); + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); int err; + algt->stat_fb++; + skcipher_request_set_tfm(&rctx->fallback_req, op->fallback_tfm); skcipher_request_set_callback(&rctx->fallback_req, areq->base.flags, areq->base.complete, areq->base.data); @@ -324,6 +335,10 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) u8 *ivtouse = areq->iv; unsigned int len = areq->cryptlen; unsigned int todo; + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); + + algt->stat_req++; ivsize = crypto_skcipher_ivsize(tfm); if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { -- GitLab From a216be3964c15661579005012b1f0d7d20a1f265 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:51 +0000 Subject: [PATCH 0244/1423] crypto: rockchip - introduce PM Add runtime PM support for rockchip crypto. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 51 ++++++++++++++++++- drivers/crypto/rockchip/rk3288_crypto.h | 1 + drivers/crypto/rockchip/rk3288_crypto_ahash.c | 10 ++++ .../crypto/rockchip/rk3288_crypto_skcipher.c | 9 ++++ 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 3e1b4f3b2422e..d9258b9e71b3c 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -65,6 +65,48 @@ static void rk_crypto_disable_clk(struct rk_crypto_info *dev) clk_disable_unprepare(dev->sclk); } +/* + * Power management strategy: The device is suspended unless a TFM exists for + * one of the algorithms proposed by this driver. + */ +static int rk_crypto_pm_suspend(struct device *dev) +{ + struct rk_crypto_info *rkdev = dev_get_drvdata(dev); + + rk_crypto_disable_clk(rkdev); + return 0; +} + +static int rk_crypto_pm_resume(struct device *dev) +{ + struct rk_crypto_info *rkdev = dev_get_drvdata(dev); + + return rk_crypto_enable_clk(rkdev); +} + +static const struct dev_pm_ops rk_crypto_pm_ops = { + SET_RUNTIME_PM_OPS(rk_crypto_pm_suspend, rk_crypto_pm_resume, NULL) +}; + +static int rk_crypto_pm_init(struct rk_crypto_info *rkdev) +{ + int err; + + pm_runtime_use_autosuspend(rkdev->dev); + pm_runtime_set_autosuspend_delay(rkdev->dev, 2000); + + err = pm_runtime_set_suspended(rkdev->dev); + if (err) + return err; + pm_runtime_enable(rkdev->dev); + return err; +} + +static void rk_crypto_pm_exit(struct rk_crypto_info *rkdev) +{ + pm_runtime_disable(rkdev->dev); +} + static irqreturn_t rk_crypto_irq_handle(int irq, void *dev_id) { struct rk_crypto_info *dev = platform_get_drvdata(dev_id); @@ -273,7 +315,9 @@ static int rk_crypto_probe(struct platform_device *pdev) crypto_engine_start(crypto_info->engine); init_completion(&crypto_info->complete); - rk_crypto_enable_clk(crypto_info); + err = rk_crypto_pm_init(crypto_info); + if (err) + goto err_pm; err = rk_crypto_register(crypto_info); if (err) { @@ -294,6 +338,8 @@ static int rk_crypto_probe(struct platform_device *pdev) return 0; err_register_alg: + rk_crypto_pm_exit(crypto_info); +err_pm: crypto_engine_exit(crypto_info->engine); err_crypto: dev_err(dev, "Crypto Accelerator not successfully registered\n"); @@ -308,7 +354,7 @@ static int rk_crypto_remove(struct platform_device *pdev) debugfs_remove_recursive(crypto_tmp->dbgfs_dir); #endif rk_crypto_unregister(); - rk_crypto_disable_clk(crypto_tmp); + rk_crypto_pm_exit(crypto_tmp); crypto_engine_exit(crypto_tmp->engine); return 0; } @@ -318,6 +364,7 @@ static struct platform_driver crypto_driver = { .remove = rk_crypto_remove, .driver = { .name = "rk3288-crypto", + .pm = &rk_crypto_pm_ops, .of_match_table = crypto_of_id_table, }, }; diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 945a8184bbadd..ddbb9246ce164 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 8856c6226be60..137013bd44102 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -328,6 +328,7 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) struct ahash_alg *alg = __crypto_ahash_alg(tfm->__crt_alg); const char *alg_name = crypto_tfm_alg_name(tfm); + int err; algt = container_of(alg, struct rk_crypto_tmp, alg.hash); @@ -349,7 +350,15 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) tctx->enginectx.op.prepare_request = rk_hash_prepare; tctx->enginectx.op.unprepare_request = rk_hash_unprepare; + err = pm_runtime_resume_and_get(tctx->dev->dev); + if (err < 0) + goto error_pm; + return 0; +error_pm: + crypto_free_ahash(tctx->fallback_tfm); + + return err; } static void rk_cra_hash_exit(struct crypto_tfm *tfm) @@ -357,6 +366,7 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); crypto_free_ahash(tctx->fallback_tfm); + pm_runtime_put_autosuspend(tctx->dev->dev); } struct rk_crypto_tmp rk_ahash_sha1 = { diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 91b8a4c574da2..3bdb304aa7948 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -454,6 +454,7 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) struct skcipher_alg *alg = crypto_skcipher_alg(tfm); const char *name = crypto_tfm_alg_name(&tfm->base); struct rk_crypto_tmp *algt; + int err; algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); @@ -471,7 +472,14 @@ static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) ctx->enginectx.op.do_one_request = rk_cipher_run; + err = pm_runtime_resume_and_get(ctx->dev->dev); + if (err < 0) + goto error_pm; + return 0; +error_pm: + crypto_free_skcipher(ctx->fallback_tfm); + return err; } static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) @@ -480,6 +488,7 @@ static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) memzero_explicit(ctx->key, ctx->keylen); crypto_free_skcipher(ctx->fallback_tfm); + pm_runtime_put_autosuspend(ctx->dev->dev); } struct rk_crypto_tmp rk_ecb_aes_alg = { -- GitLab From 6f61192549d0214f8d9d1e1d3152e450658ed1e9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:52 +0000 Subject: [PATCH 0245/1423] crypto: rockchip - handle reset also in PM reset could be handled by PM functions. We keep the initial reset pulse to be sure the hw is a know device state after probe. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index d9258b9e71b3c..399829ef92e0f 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -74,14 +74,23 @@ static int rk_crypto_pm_suspend(struct device *dev) struct rk_crypto_info *rkdev = dev_get_drvdata(dev); rk_crypto_disable_clk(rkdev); + reset_control_assert(rkdev->rst); + return 0; } static int rk_crypto_pm_resume(struct device *dev) { struct rk_crypto_info *rkdev = dev_get_drvdata(dev); + int ret; + + ret = rk_crypto_enable_clk(rkdev); + if (ret) + return ret; + + reset_control_deassert(rkdev->rst); + return 0; - return rk_crypto_enable_clk(rkdev); } static const struct dev_pm_ops rk_crypto_pm_ops = { @@ -222,13 +231,6 @@ static void rk_crypto_unregister(void) } } -static void rk_crypto_action(void *data) -{ - struct rk_crypto_info *crypto_info = data; - - reset_control_assert(crypto_info->rst); -} - static const struct of_device_id crypto_of_id_table[] = { { .compatible = "rockchip,rk3288-crypto" }, {} @@ -258,10 +260,6 @@ static int rk_crypto_probe(struct platform_device *pdev) usleep_range(10, 20); reset_control_deassert(crypto_info->rst); - err = devm_add_action_or_reset(dev, rk_crypto_action, crypto_info); - if (err) - goto err_crypto; - crypto_info->reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(crypto_info->reg)) { err = PTR_ERR(crypto_info->reg); -- GitLab From 3a6fd464f48ad35d8cf15d81fd92094132dc862a Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:53 +0000 Subject: [PATCH 0246/1423] crypto: rockchip - use clk_bulk to simplify clock management rk3328 does not have the same clock names than rk3288, instead of using a complex clock management, let's use clk_bulk to simplify their handling. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 66 ++++--------------------- drivers/crypto/rockchip/rk3288_crypto.h | 6 +-- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 399829ef92e0f..a635029ac71d0 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -22,47 +22,16 @@ static int rk_crypto_enable_clk(struct rk_crypto_info *dev) { int err; - err = clk_prepare_enable(dev->sclk); - if (err) { - dev_err(dev->dev, "[%s:%d], Couldn't enable clock sclk\n", - __func__, __LINE__); - goto err_return; - } - err = clk_prepare_enable(dev->aclk); - if (err) { - dev_err(dev->dev, "[%s:%d], Couldn't enable clock aclk\n", - __func__, __LINE__); - goto err_aclk; - } - err = clk_prepare_enable(dev->hclk); - if (err) { - dev_err(dev->dev, "[%s:%d], Couldn't enable clock hclk\n", - __func__, __LINE__); - goto err_hclk; - } - err = clk_prepare_enable(dev->dmaclk); - if (err) { - dev_err(dev->dev, "[%s:%d], Couldn't enable clock dmaclk\n", - __func__, __LINE__); - goto err_dmaclk; - } - return err; -err_dmaclk: - clk_disable_unprepare(dev->hclk); -err_hclk: - clk_disable_unprepare(dev->aclk); -err_aclk: - clk_disable_unprepare(dev->sclk); -err_return: + err = clk_bulk_prepare_enable(dev->num_clks, dev->clks); + if (err) + dev_err(dev->dev, "Could not enable clock clks\n"); + return err; } static void rk_crypto_disable_clk(struct rk_crypto_info *dev) { - clk_disable_unprepare(dev->dmaclk); - clk_disable_unprepare(dev->hclk); - clk_disable_unprepare(dev->aclk); - clk_disable_unprepare(dev->sclk); + clk_bulk_disable_unprepare(dev->num_clks, dev->clks); } /* @@ -266,27 +235,10 @@ static int rk_crypto_probe(struct platform_device *pdev) goto err_crypto; } - crypto_info->aclk = devm_clk_get(&pdev->dev, "aclk"); - if (IS_ERR(crypto_info->aclk)) { - err = PTR_ERR(crypto_info->aclk); - goto err_crypto; - } - - crypto_info->hclk = devm_clk_get(&pdev->dev, "hclk"); - if (IS_ERR(crypto_info->hclk)) { - err = PTR_ERR(crypto_info->hclk); - goto err_crypto; - } - - crypto_info->sclk = devm_clk_get(&pdev->dev, "sclk"); - if (IS_ERR(crypto_info->sclk)) { - err = PTR_ERR(crypto_info->sclk); - goto err_crypto; - } - - crypto_info->dmaclk = devm_clk_get(&pdev->dev, "apb_pclk"); - if (IS_ERR(crypto_info->dmaclk)) { - err = PTR_ERR(crypto_info->dmaclk); + crypto_info->num_clks = devm_clk_bulk_get_all(&pdev->dev, + &crypto_info->clks); + if (crypto_info->num_clks < 3) { + err = -EINVAL; goto err_crypto; } diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index ddbb9246ce164..28bf09fe1c1d1 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -190,10 +190,8 @@ struct rk_crypto_info { struct device *dev; - struct clk *aclk; - struct clk *hclk; - struct clk *sclk; - struct clk *dmaclk; + struct clk_bulk_data *clks; + int num_clks; struct reset_control *rst; void __iomem *reg; int irq; -- GitLab From e803188400d32d28ecfbef0878c289e3c7026723 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:54 +0000 Subject: [PATCH 0247/1423] crypto: rockchip - add myself as maintainer Nobody is set as maintainer of rockchip crypto, I propose to do it as I have already reworked lot of this code. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..3489126acd1f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17753,6 +17753,13 @@ F: Documentation/ABI/*/sysfs-driver-hid-roccat* F: drivers/hid/hid-roccat* F: include/linux/hid-roccat* +ROCKCHIP CRYPTO DRIVERS +M: Corentin Labbe +L: linux-crypto@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml +F: drivers/crypto/rockchip/ + ROCKCHIP I2S TDM DRIVER M: Nicolas Frattaroli L: linux-rockchip@lists.infradead.org -- GitLab From 37bc22159c456ad43fb852fc6ed60f4081df25df Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:55 +0000 Subject: [PATCH 0248/1423] crypto: rockchip - use read_poll_timeout Use read_poll_timeout instead of open coding it. In the same time, fix indentation of related comment. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 137013bd44102..1fbab86c92384 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -10,6 +10,7 @@ */ #include #include +#include #include "rk3288_crypto.h" /* @@ -295,18 +296,17 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) sg = sg_next(sg); } - /* - * it will take some time to process date after last dma - * transmission. - * - * waiting time is relative with the last date len, - * so cannot set a fixed time here. - * 10us makes system not call here frequently wasting - * efficiency, and make it response quickly when dma - * complete. - */ - while (!CRYPTO_READ(tctx->dev, RK_CRYPTO_HASH_STS)) - udelay(10); + /* + * it will take some time to process date after last dma + * transmission. + * + * waiting time is relative with the last date len, + * so cannot set a fixed time here. + * 10us makes system not call here frequently wasting + * efficiency, and make it response quickly when dma + * complete. + */ + readl_poll_timeout(tctx->dev->reg + RK_CRYPTO_HASH_STS, v, v == 0, 10, 1000); for (i = 0; i < crypto_ahash_digestsize(tfm) / 4; i++) { v = readl(tctx->dev->reg + RK_CRYPTO_HASH_DOUT_0 + i * 4); -- GitLab From 456698746b40008eb0924eb7e9ec908330948b2d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:56 +0000 Subject: [PATCH 0249/1423] crypto: rockchip - fix style issue This patch fixes some warning reported by checkpatch Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 1fbab86c92384..fae779d73c840 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -336,7 +336,7 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) /* for fallback */ tctx->fallback_tfm = crypto_alloc_ahash(alg_name, 0, - CRYPTO_ALG_NEED_FALLBACK); + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(tctx->fallback_tfm)) { dev_err(tctx->dev->dev, "Could not load fallback driver.\n"); return PTR_ERR(tctx->fallback_tfm); @@ -394,8 +394,8 @@ struct rk_crypto_tmp rk_ahash_sha1 = { .cra_init = rk_cra_hash_init, .cra_exit = rk_cra_hash_exit, .cra_module = THIS_MODULE, - } - } + } + } } }; @@ -424,8 +424,8 @@ struct rk_crypto_tmp rk_ahash_sha256 = { .cra_init = rk_cra_hash_init, .cra_exit = rk_cra_hash_exit, .cra_module = THIS_MODULE, - } - } + } + } } }; @@ -454,7 +454,7 @@ struct rk_crypto_tmp rk_ahash_md5 = { .cra_init = rk_cra_hash_init, .cra_exit = rk_cra_hash_exit, .cra_module = THIS_MODULE, - } } + } } }; -- GitLab From e65e90101329de0fe304e2df057f68c5f0fa4748 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:57 +0000 Subject: [PATCH 0250/1423] crypto: rockchip - add support for rk3328 The rk3328 could be used as-is by the rockchip driver. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index a635029ac71d0..c92559b83f7dd 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -202,6 +202,7 @@ static void rk_crypto_unregister(void) static const struct of_device_id crypto_of_id_table[] = { { .compatible = "rockchip,rk3288-crypto" }, + { .compatible = "rockchip,rk3328-crypto" }, {} }; MODULE_DEVICE_TABLE(of, crypto_of_id_table); -- GitLab From a7fa0644dd0b91fab97398de7ea4672a6526261f Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:58 +0000 Subject: [PATCH 0251/1423] crypto: rockchip - rename ablk functions to cipher Some functions have still ablk in their name even if there are not handling ablk_cipher anymore. So let's rename them. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- .../crypto/rockchip/rk3288_crypto_skcipher.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 3bdb304aa7948..d60c206e717d5 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -273,7 +273,7 @@ static int rk_des3_ede_cbc_decrypt(struct skcipher_request *req) return rk_handle_req(dev, req); } -static void rk_ablk_hw_init(struct rk_crypto_info *dev, struct skcipher_request *req) +static void rk_cipher_hw_init(struct rk_crypto_info *dev, struct skcipher_request *req) { struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(req); struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); @@ -382,7 +382,7 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) } } err = 0; - rk_ablk_hw_init(ctx->dev, areq); + rk_cipher_hw_init(ctx->dev, areq); if (ivsize) { if (ivsize == DES_BLOCK_SIZE) memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_IV_0, ivtouse, ivsize); @@ -448,7 +448,7 @@ theend_iv: return err; } -static int rk_ablk_init_tfm(struct crypto_skcipher *tfm) +static int rk_cipher_tfm_init(struct crypto_skcipher *tfm) { struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); @@ -482,7 +482,7 @@ error_pm: return err; } -static void rk_ablk_exit_tfm(struct crypto_skcipher *tfm) +static void rk_cipher_tfm_exit(struct crypto_skcipher *tfm) { struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -503,8 +503,8 @@ struct rk_crypto_tmp rk_ecb_aes_alg = { .base.cra_alignmask = 0x0f, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .setkey = rk_aes_setkey, @@ -525,8 +525,8 @@ struct rk_crypto_tmp rk_cbc_aes_alg = { .base.cra_alignmask = 0x0f, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, @@ -548,8 +548,8 @@ struct rk_crypto_tmp rk_ecb_des_alg = { .base.cra_alignmask = 0x07, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = DES_KEY_SIZE, .max_keysize = DES_KEY_SIZE, .setkey = rk_des_setkey, @@ -570,8 +570,8 @@ struct rk_crypto_tmp rk_cbc_des_alg = { .base.cra_alignmask = 0x07, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = DES_KEY_SIZE, .max_keysize = DES_KEY_SIZE, .ivsize = DES_BLOCK_SIZE, @@ -593,8 +593,8 @@ struct rk_crypto_tmp rk_ecb_des3_ede_alg = { .base.cra_alignmask = 0x07, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .setkey = rk_tdes_setkey, @@ -615,8 +615,8 @@ struct rk_crypto_tmp rk_cbc_des3_ede_alg = { .base.cra_alignmask = 0x07, .base.cra_module = THIS_MODULE, - .init = rk_ablk_init_tfm, - .exit = rk_ablk_exit_tfm, + .init = rk_cipher_tfm_init, + .exit = rk_cipher_tfm_exit, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .ivsize = DES_BLOCK_SIZE, -- GitLab From 2e3b149578c30275db9c3501c1d9dec36d16622a Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:54:59 +0000 Subject: [PATCH 0252/1423] crypto: rockchip - rework rk_handle_req function This patch rework the rk_handle_req(), simply removing the rk_crypto_info parameter. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- .../crypto/rockchip/rk3288_crypto_skcipher.c | 68 +++++-------------- 1 file changed, 17 insertions(+), 51 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index d60c206e717d5..3187869c4c68e 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -82,10 +82,12 @@ static int rk_cipher_fallback(struct skcipher_request *areq) return err; } -static int rk_handle_req(struct rk_crypto_info *dev, - struct skcipher_request *req) +static int rk_cipher_handle_req(struct skcipher_request *req) { - struct crypto_engine *engine = dev->engine; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct rk_cipher_ctx *tctx = crypto_skcipher_ctx(tfm); + struct rk_crypto_info *rkc = tctx->dev; + struct crypto_engine *engine = rkc->engine; if (rk_cipher_need_fallback(req)) return rk_cipher_fallback(req); @@ -142,135 +144,99 @@ static int rk_tdes_setkey(struct crypto_skcipher *cipher, static int rk_aes_ecb_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_AES_ECB_MODE; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_aes_ecb_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_AES_ECB_MODE | RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_aes_cbc_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_AES_CBC_MODE; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_aes_cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_AES_CBC_MODE | RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des_ecb_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = 0; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des_ecb_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des_cbc_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des_cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_CHAINMODE_CBC | RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des3_ede_ecb_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_SELECT; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des3_ede_ecb_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des3_ede_cbc_encrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static int rk_des3_ede_cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *dev = ctx->dev; rctx->mode = RK_CRYPTO_TDES_SELECT | RK_CRYPTO_TDES_CHAINMODE_CBC | RK_CRYPTO_DEC; - return rk_handle_req(dev, req); + return rk_cipher_handle_req(req); } static void rk_cipher_hw_init(struct rk_crypto_info *dev, struct skcipher_request *req) -- GitLab From c018c7a9dd198ce965ca4d10c7b083849bc533be Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:00 +0000 Subject: [PATCH 0253/1423] crypto: rockchip - use a rk_crypto_info variable instead of lot of indirection Instead of using lot of ctx->dev->xx indirections, use an intermediate variable for rk_crypto_info. This will help later, when 2 different rk_crypto_info would be used. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 23 +++++++----- .../crypto/rockchip/rk3288_crypto_skcipher.c | 37 ++++++++++--------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index fae779d73c840..636dbcde0ca3f 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -226,9 +226,10 @@ static int rk_hash_prepare(struct crypto_engine *engine, void *breq) struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + struct rk_crypto_info *rkc = tctx->dev; int ret; - ret = dma_map_sg(tctx->dev->dev, areq->src, sg_nents(areq->src), DMA_TO_DEVICE); + ret = dma_map_sg(rkc->dev, areq->src, sg_nents(areq->src), DMA_TO_DEVICE); if (ret <= 0) return -EINVAL; @@ -243,8 +244,9 @@ static int rk_hash_unprepare(struct crypto_engine *engine, void *breq) struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); + struct rk_crypto_info *rkc = tctx->dev; - dma_unmap_sg(tctx->dev->dev, areq->src, rctx->nrsg, DMA_TO_DEVICE); + dma_unmap_sg(rkc->dev, areq->src, rctx->nrsg, DMA_TO_DEVICE); return 0; } @@ -257,6 +259,7 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) struct ahash_alg *alg = __crypto_ahash_alg(tfm->base.__crt_alg); struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.hash); struct scatterlist *sg = areq->src; + struct rk_crypto_info *rkc = tctx->dev; int err = 0; int i; u32 v; @@ -283,13 +286,13 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) rk_ahash_reg_init(areq); while (sg) { - reinit_completion(&tctx->dev->complete); - tctx->dev->status = 0; - crypto_ahash_dma_start(tctx->dev, sg); - wait_for_completion_interruptible_timeout(&tctx->dev->complete, + reinit_completion(&rkc->complete); + rkc->status = 0; + crypto_ahash_dma_start(rkc, sg); + wait_for_completion_interruptible_timeout(&rkc->complete, msecs_to_jiffies(2000)); - if (!tctx->dev->status) { - dev_err(tctx->dev->dev, "DMA timeout\n"); + if (!rkc->status) { + dev_err(rkc->dev, "DMA timeout\n"); err = -EFAULT; goto theend; } @@ -306,10 +309,10 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) * efficiency, and make it response quickly when dma * complete. */ - readl_poll_timeout(tctx->dev->reg + RK_CRYPTO_HASH_STS, v, v == 0, 10, 1000); + readl_poll_timeout(rkc->reg + RK_CRYPTO_HASH_STS, v, v == 0, 10, 1000); for (i = 0; i < crypto_ahash_digestsize(tfm) / 4; i++) { - v = readl(tctx->dev->reg + RK_CRYPTO_HASH_DOUT_0 + i * 4); + v = readl(rkc->reg + RK_CRYPTO_HASH_DOUT_0 + i * 4); put_unaligned_le32(v, areq->result + i * 4); } diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 3187869c4c68e..6a1bea98fded1 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -303,6 +303,7 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) unsigned int todo; struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); + struct rk_crypto_info *rkc = ctx->dev; algt->stat_req++; @@ -330,49 +331,49 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) scatterwalk_map_and_copy(biv, sgs, offset, ivsize, 0); } if (sgs == sgd) { - err = dma_map_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); + err = dma_map_sg(rkc->dev, sgs, 1, DMA_BIDIRECTIONAL); if (err <= 0) { err = -EINVAL; goto theend_iv; } } else { - err = dma_map_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); + err = dma_map_sg(rkc->dev, sgs, 1, DMA_TO_DEVICE); if (err <= 0) { err = -EINVAL; goto theend_iv; } - err = dma_map_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); + err = dma_map_sg(rkc->dev, sgd, 1, DMA_FROM_DEVICE); if (err <= 0) { err = -EINVAL; goto theend_sgs; } } err = 0; - rk_cipher_hw_init(ctx->dev, areq); + rk_cipher_hw_init(rkc, areq); if (ivsize) { if (ivsize == DES_BLOCK_SIZE) - memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_IV_0, ivtouse, ivsize); + memcpy_toio(rkc->reg + RK_CRYPTO_TDES_IV_0, ivtouse, ivsize); else - memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_IV_0, ivtouse, ivsize); + memcpy_toio(rkc->reg + RK_CRYPTO_AES_IV_0, ivtouse, ivsize); } - reinit_completion(&ctx->dev->complete); - ctx->dev->status = 0; + reinit_completion(&rkc->complete); + rkc->status = 0; todo = min(sg_dma_len(sgs), len); len -= todo; - crypto_dma_start(ctx->dev, sgs, sgd, todo / 4); - wait_for_completion_interruptible_timeout(&ctx->dev->complete, + crypto_dma_start(rkc, sgs, sgd, todo / 4); + wait_for_completion_interruptible_timeout(&rkc->complete, msecs_to_jiffies(2000)); - if (!ctx->dev->status) { - dev_err(ctx->dev->dev, "DMA timeout\n"); + if (!rkc->status) { + dev_err(rkc->dev, "DMA timeout\n"); err = -EFAULT; goto theend; } if (sgs == sgd) { - dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); + dma_unmap_sg(rkc->dev, sgs, 1, DMA_BIDIRECTIONAL); } else { - dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); - dma_unmap_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); + dma_unmap_sg(rkc->dev, sgs, 1, DMA_TO_DEVICE); + dma_unmap_sg(rkc->dev, sgd, 1, DMA_FROM_DEVICE); } if (rctx->mode & RK_CRYPTO_DEC) { memcpy(iv, biv, ivsize); @@ -405,10 +406,10 @@ theend: theend_sgs: if (sgs == sgd) { - dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_BIDIRECTIONAL); + dma_unmap_sg(rkc->dev, sgs, 1, DMA_BIDIRECTIONAL); } else { - dma_unmap_sg(ctx->dev->dev, sgs, 1, DMA_TO_DEVICE); - dma_unmap_sg(ctx->dev->dev, sgd, 1, DMA_FROM_DEVICE); + dma_unmap_sg(rkc->dev, sgs, 1, DMA_TO_DEVICE); + dma_unmap_sg(rkc->dev, sgd, 1, DMA_FROM_DEVICE); } theend_iv: return err; -- GitLab From ea389be9857721252367fd2cf81bc8068e060693 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:01 +0000 Subject: [PATCH 0254/1423] crypto: rockchip - use the rk_crypto_info given as parameter Instead of using the crypto_info from TFM ctx, use the one given as parameter. Reviewed-by: John Keeping Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_skcipher.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 6a1bea98fded1..cf0dfb6029d82 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -254,7 +254,7 @@ static void rk_cipher_hw_init(struct rk_crypto_info *dev, struct skcipher_reques RK_CRYPTO_TDES_BYTESWAP_KEY | RK_CRYPTO_TDES_BYTESWAP_IV; CRYPTO_WRITE(dev, RK_CRYPTO_TDES_CTRL, rctx->mode); - memcpy_toio(ctx->dev->reg + RK_CRYPTO_TDES_KEY1_0, ctx->key, ctx->keylen); + memcpy_toio(dev->reg + RK_CRYPTO_TDES_KEY1_0, ctx->key, ctx->keylen); conf_reg = RK_CRYPTO_DESSEL; } else { rctx->mode |= RK_CRYPTO_AES_FIFO_MODE | @@ -266,7 +266,7 @@ static void rk_cipher_hw_init(struct rk_crypto_info *dev, struct skcipher_reques else if (ctx->keylen == AES_KEYSIZE_256) rctx->mode |= RK_CRYPTO_AES_256BIT_key; CRYPTO_WRITE(dev, RK_CRYPTO_AES_CTRL, rctx->mode); - memcpy_toio(ctx->dev->reg + RK_CRYPTO_AES_KEY_0, ctx->key, ctx->keylen); + memcpy_toio(dev->reg + RK_CRYPTO_AES_KEY_0, ctx->key, ctx->keylen); } conf_reg |= RK_CRYPTO_BYTESWAP_BTFIFO | RK_CRYPTO_BYTESWAP_BRFIFO; -- GitLab From 81aaf680e85207d6521b250b2a80ba7c91cc9cbe Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:02 +0000 Subject: [PATCH 0255/1423] dt-bindings: crypto: convert rockchip-crypto to YAML Convert rockchip-crypto to YAML. Reviewed-by: John Keeping Reviewed-by: Krzysztof Kozlowski Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- .../crypto/rockchip,rk3288-crypto.yaml | 64 +++++++++++++++++++ .../bindings/crypto/rockchip-crypto.txt | 28 -------- 2 files changed, 64 insertions(+), 28 deletions(-) create mode 100644 Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml delete mode 100644 Documentation/devicetree/bindings/crypto/rockchip-crypto.txt diff --git a/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml new file mode 100644 index 0000000000000..8a219d439d02a --- /dev/null +++ b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/crypto/rockchip,rk3288-crypto.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip Electronics Security Accelerator + +maintainers: + - Heiko Stuebner + +properties: + compatible: + enum: + - rockchip,rk3288-crypto + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 4 + + clock-names: + items: + - const: aclk + - const: hclk + - const: sclk + - const: apb_pclk + + resets: + maxItems: 1 + + reset-names: + items: + - const: crypto-rst + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - resets + - reset-names + +additionalProperties: false + +examples: + - | + #include + #include + crypto@ff8a0000 { + compatible = "rockchip,rk3288-crypto"; + reg = <0xff8a0000 0x4000>; + interrupts = ; + clocks = <&cru ACLK_CRYPTO>, <&cru HCLK_CRYPTO>, + <&cru SCLK_CRYPTO>, <&cru ACLK_DMAC1>; + clock-names = "aclk", "hclk", "sclk", "apb_pclk"; + resets = <&cru SRST_CRYPTO>; + reset-names = "crypto-rst"; + }; diff --git a/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt b/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt deleted file mode 100644 index 5e2ba385b8c9b..0000000000000 --- a/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt +++ /dev/null @@ -1,28 +0,0 @@ -Rockchip Electronics And Security Accelerator - -Required properties: -- compatible: Should be "rockchip,rk3288-crypto" -- reg: Base physical address of the engine and length of memory mapped - region -- interrupts: Interrupt number -- clocks: Reference to the clocks about crypto -- clock-names: "aclk" used to clock data - "hclk" used to clock data - "sclk" used to clock crypto accelerator - "apb_pclk" used to clock dma -- resets: Must contain an entry for each entry in reset-names. - See ../reset/reset.txt for details. -- reset-names: Must include the name "crypto-rst". - -Examples: - - crypto: cypto-controller@ff8a0000 { - compatible = "rockchip,rk3288-crypto"; - reg = <0xff8a0000 0x4000>; - interrupts = ; - clocks = <&cru ACLK_CRYPTO>, <&cru HCLK_CRYPTO>, - <&cru SCLK_CRYPTO>, <&cru ACLK_DMAC1>; - clock-names = "aclk", "hclk", "sclk", "apb_pclk"; - resets = <&cru SRST_CRYPTO>; - reset-names = "crypto-rst"; - }; -- GitLab From d1b5749687618d969c0be6428174a18a7e94ebd2 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:03 +0000 Subject: [PATCH 0256/1423] dt-bindings: crypto: rockchip: add new compatible Since driver support new compatible, we need to update the driver bindings. Signed-off-by: Corentin Labbe Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- .../crypto/rockchip,rk3288-crypto.yaml | 79 +++++++++++++++++-- 1 file changed, 71 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml index 8a219d439d02a..f1a9da8bff7a7 100644 --- a/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml +++ b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml @@ -13,6 +13,8 @@ properties: compatible: enum: - rockchip,rk3288-crypto + - rockchip,rk3328-crypto + - rockchip,rk3399-crypto reg: maxItems: 1 @@ -21,21 +23,82 @@ properties: maxItems: 1 clocks: + minItems: 3 maxItems: 4 clock-names: - items: - - const: aclk - - const: hclk - - const: sclk - - const: apb_pclk + minItems: 3 + maxItems: 4 resets: - maxItems: 1 + minItems: 1 + maxItems: 3 reset-names: - items: - - const: crypto-rst + minItems: 1 + maxItems: 3 + +allOf: + - if: + properties: + compatible: + contains: + const: rockchip,rk3288-crypto + then: + properties: + clocks: + minItems: 4 + clock-names: + items: + - const: aclk + - const: hclk + - const: sclk + - const: apb_pclk + resets: + maxItems: 1 + reset-names: + items: + - const: crypto-rst + - if: + properties: + compatible: + contains: + const: rockchip,rk3328-crypto + then: + properties: + clocks: + maxItems: 3 + clock-names: + items: + - const: hclk_master + - const: hclk_slave + - const: sclk + resets: + maxItems: 1 + reset-names: + items: + - const: crypto-rst + - if: + properties: + compatible: + contains: + const: rockchip,rk3399-crypto + then: + properties: + clocks: + maxItems: 3 + clock-names: + items: + - const: hclk_master + - const: hclk_slave + - const: sclk + resets: + minItems: 3 + reset-names: + items: + - const: master + - const: slave + - const: crypto-rst required: - compatible -- GitLab From 2d3c756adcd7a7ee15b6a55cf01b363e3f134e79 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:07 +0000 Subject: [PATCH 0257/1423] crypto: rockchip - store crypto_info in request context The crypto_info to use must be stored in the request context. This will help when 2 crypto_info will be available on rk3399. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.h | 2 ++ drivers/crypto/rockchip/rk3288_crypto_ahash.c | 14 ++++++-------- drivers/crypto/rockchip/rk3288_crypto_skcipher.c | 6 ++++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 28bf09fe1c1d1..ff9fc25972ebe 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -215,6 +215,7 @@ struct rk_ahash_ctx { /* the private variable of hash for fallback */ struct rk_ahash_rctx { + struct rk_crypto_info *dev; struct ahash_request fallback_req; u32 mode; int nrsg; @@ -231,6 +232,7 @@ struct rk_cipher_ctx { }; struct rk_cipher_rctx { + struct rk_crypto_info *dev; u8 backup_iv[AES_BLOCK_SIZE]; u32 mode; struct skcipher_request fallback_req; // keep at the end diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 636dbcde0ca3f..d1bf68cb390d1 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -200,6 +200,7 @@ static int rk_ahash_export(struct ahash_request *req, void *out) static int rk_ahash_digest(struct ahash_request *req) { + struct rk_ahash_rctx *rctx = ahash_request_ctx(req); struct rk_ahash_ctx *tctx = crypto_tfm_ctx(req->base.tfm); struct rk_crypto_info *dev = tctx->dev; @@ -209,6 +210,8 @@ static int rk_ahash_digest(struct ahash_request *req) if (!req->nbytes) return zero_message_process(req); + rctx->dev = dev; + return crypto_transfer_hash_request_to_engine(dev->engine, req); } @@ -223,10 +226,8 @@ static void crypto_ahash_dma_start(struct rk_crypto_info *dev, struct scatterlis static int rk_hash_prepare(struct crypto_engine *engine, void *breq) { struct ahash_request *areq = container_of(breq, struct ahash_request, base); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); - struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); - struct rk_crypto_info *rkc = tctx->dev; + struct rk_crypto_info *rkc = rctx->dev; int ret; ret = dma_map_sg(rkc->dev, areq->src, sg_nents(areq->src), DMA_TO_DEVICE); @@ -241,10 +242,8 @@ static int rk_hash_prepare(struct crypto_engine *engine, void *breq) static int rk_hash_unprepare(struct crypto_engine *engine, void *breq) { struct ahash_request *areq = container_of(breq, struct ahash_request, base); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); - struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); - struct rk_crypto_info *rkc = tctx->dev; + struct rk_crypto_info *rkc = rctx->dev; dma_unmap_sg(rkc->dev, areq->src, rctx->nrsg, DMA_TO_DEVICE); return 0; @@ -255,11 +254,10 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) struct ahash_request *areq = container_of(breq, struct ahash_request, base); struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct rk_ahash_rctx *rctx = ahash_request_ctx(areq); - struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); struct ahash_alg *alg = __crypto_ahash_alg(tfm->base.__crt_alg); struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.hash); struct scatterlist *sg = areq->src; - struct rk_crypto_info *rkc = tctx->dev; + struct rk_crypto_info *rkc = rctx->dev; int err = 0; int i; u32 v; diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index cf0dfb6029d82..0b1c90ababb71 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -86,12 +86,15 @@ static int rk_cipher_handle_req(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct rk_cipher_ctx *tctx = crypto_skcipher_ctx(tfm); + struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); struct rk_crypto_info *rkc = tctx->dev; struct crypto_engine *engine = rkc->engine; if (rk_cipher_need_fallback(req)) return rk_cipher_fallback(req); + rctx->dev = rkc; + return crypto_transfer_skcipher_request_to_engine(engine, req); } @@ -290,7 +293,6 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) { struct skcipher_request *areq = container_of(async_req, struct skcipher_request, base); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(areq); - struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(areq); struct scatterlist *sgs, *sgd; int err = 0; @@ -303,7 +305,7 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) unsigned int todo; struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); - struct rk_crypto_info *rkc = ctx->dev; + struct rk_crypto_info *rkc = rctx->dev; algt->stat_req++; -- GitLab From e220e6719438f7a99fe0a73e6e126481380202fa Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:08 +0000 Subject: [PATCH 0258/1423] crypto: rockchip - Check for clocks numbers and their frequencies Add the number of clocks needed for each compatible. Rockchip's datasheet give maximum frequencies for some clocks, so add checks for verifying they are within limits. Let's start with rk3288 for clock frequency check, other will came later. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 75 +++++++++++++++++++++---- drivers/crypto/rockchip/rk3288_crypto.h | 16 +++++- 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index c92559b83f7dd..232dc625d6e50 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -14,10 +14,58 @@ #include #include #include +#include #include #include #include +static const struct rk_variant rk3288_variant = { + .num_clks = 4, + .rkclks = { + { "sclk", 150000000}, + } +}; + +static const struct rk_variant rk3328_variant = { + .num_clks = 3, +}; + +static int rk_crypto_get_clks(struct rk_crypto_info *dev) +{ + int i, j, err; + unsigned long cr; + + dev->num_clks = devm_clk_bulk_get_all(dev->dev, &dev->clks); + if (dev->num_clks < dev->variant->num_clks) { + dev_err(dev->dev, "Missing clocks, got %d instead of %d\n", + dev->num_clks, dev->variant->num_clks); + return -EINVAL; + } + + for (i = 0; i < dev->num_clks; i++) { + cr = clk_get_rate(dev->clks[i].clk); + for (j = 0; j < ARRAY_SIZE(dev->variant->rkclks); j++) { + if (dev->variant->rkclks[j].max == 0) + continue; + if (strcmp(dev->variant->rkclks[j].name, dev->clks[i].id)) + continue; + if (cr > dev->variant->rkclks[j].max) { + err = clk_set_rate(dev->clks[i].clk, + dev->variant->rkclks[j].max); + if (err) + dev_err(dev->dev, "Fail downclocking %s from %lu to %lu\n", + dev->variant->rkclks[j].name, cr, + dev->variant->rkclks[j].max); + else + dev_info(dev->dev, "Downclocking %s from %lu to %lu\n", + dev->variant->rkclks[j].name, cr, + dev->variant->rkclks[j].max); + } + } + } + return 0; +} + static int rk_crypto_enable_clk(struct rk_crypto_info *dev) { int err; @@ -201,8 +249,12 @@ static void rk_crypto_unregister(void) } static const struct of_device_id crypto_of_id_table[] = { - { .compatible = "rockchip,rk3288-crypto" }, - { .compatible = "rockchip,rk3328-crypto" }, + { .compatible = "rockchip,rk3288-crypto", + .data = &rk3288_variant, + }, + { .compatible = "rockchip,rk3328-crypto", + .data = &rk3328_variant, + }, {} }; MODULE_DEVICE_TABLE(of, crypto_of_id_table); @@ -220,6 +272,15 @@ static int rk_crypto_probe(struct platform_device *pdev) goto err_crypto; } + crypto_info->dev = &pdev->dev; + platform_set_drvdata(pdev, crypto_info); + + crypto_info->variant = of_device_get_match_data(&pdev->dev); + if (!crypto_info->variant) { + dev_err(&pdev->dev, "Missing variant\n"); + return -EINVAL; + } + crypto_info->rst = devm_reset_control_get(dev, "crypto-rst"); if (IS_ERR(crypto_info->rst)) { err = PTR_ERR(crypto_info->rst); @@ -236,12 +297,9 @@ static int rk_crypto_probe(struct platform_device *pdev) goto err_crypto; } - crypto_info->num_clks = devm_clk_bulk_get_all(&pdev->dev, - &crypto_info->clks); - if (crypto_info->num_clks < 3) { - err = -EINVAL; + err = rk_crypto_get_clks(crypto_info); + if (err) goto err_crypto; - } crypto_info->irq = platform_get_irq(pdev, 0); if (crypto_info->irq < 0) { @@ -259,9 +317,6 @@ static int rk_crypto_probe(struct platform_device *pdev) goto err_crypto; } - crypto_info->dev = &pdev->dev; - platform_set_drvdata(pdev, crypto_info); - crypto_info->engine = crypto_engine_alloc_init(&pdev->dev, true); crypto_engine_start(crypto_info->engine); init_completion(&crypto_info->complete); diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index ff9fc25972ebe..ac979d67ced98 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -188,14 +188,26 @@ #define CRYPTO_WRITE(dev, offset, val) \ writel_relaxed((val), ((dev)->reg + (offset))) +#define RK_MAX_CLKS 4 + +struct rk_clks { + const char *name; + unsigned long max; +}; + +struct rk_variant { + int num_clks; + struct rk_clks rkclks[RK_MAX_CLKS]; +}; + struct rk_crypto_info { struct device *dev; struct clk_bulk_data *clks; - int num_clks; + int num_clks; struct reset_control *rst; void __iomem *reg; int irq; - + const struct rk_variant *variant; struct crypto_engine *engine; struct completion complete; int status; -- GitLab From 0d31b14c9e4178a129a1aa5e491e4da1489c07de Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:09 +0000 Subject: [PATCH 0259/1423] crypto: rockchip - rk_ahash_reg_init use crypto_info from parameter rk_ahash_reg_init() use crypto_info from TFM context, since we will remove it, let's take if from parameters. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index d1bf68cb390d1..30f78256c9551 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -78,12 +78,10 @@ static int zero_message_process(struct ahash_request *req) return 0; } -static void rk_ahash_reg_init(struct ahash_request *req) +static void rk_ahash_reg_init(struct ahash_request *req, + struct rk_crypto_info *dev) { struct rk_ahash_rctx *rctx = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct rk_ahash_ctx *tctx = crypto_ahash_ctx(tfm); - struct rk_crypto_info *dev = tctx->dev; int reg_status; reg_status = CRYPTO_READ(dev, RK_CRYPTO_CTRL) | @@ -281,7 +279,7 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) goto theend; } - rk_ahash_reg_init(areq); + rk_ahash_reg_init(areq, rkc); while (sg) { reinit_completion(&rkc->complete); -- GitLab From c5a1e104c35e5134b6048f1e03960a6ac9c42935 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:10 +0000 Subject: [PATCH 0260/1423] crypto: rockchip - permit to have more than one reset The RK3399 has 3 resets, so the driver to handle multiple resets. This is done by using devm_reset_control_array_get_exclusive(). Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 232dc625d6e50..d96f375423d59 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -281,7 +281,7 @@ static int rk_crypto_probe(struct platform_device *pdev) return -EINVAL; } - crypto_info->rst = devm_reset_control_get(dev, "crypto-rst"); + crypto_info->rst = devm_reset_control_array_get_exclusive(dev); if (IS_ERR(crypto_info->rst)) { err = PTR_ERR(crypto_info->rst); goto err_crypto; -- GitLab From 9dcd71c863a6f6476378d076d3e9189c854d49fd Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 27 Sep 2022 07:55:11 +0000 Subject: [PATCH 0261/1423] crypto: rockchip - Add support for RK3399 The RK3399 has 2 rk3288 compatible crypto device named crypto0 and crypto1. The only difference is lack of RSA in crypto1. We need to add driver support for 2 parallel instance as only one need to register crypto algorithms. Then the driver will round robin each request on each device. For avoiding complexity (device bringup after a TFM is created), PM is modified to be handled per request. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 92 +++++++++++++++---- drivers/crypto/rockchip/rk3288_crypto.h | 25 +++-- drivers/crypto/rockchip/rk3288_crypto_ahash.c | 37 ++++---- .../crypto/rockchip/rk3288_crypto_skcipher.c | 37 ++++---- 4 files changed, 123 insertions(+), 68 deletions(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index d96f375423d59..6217e73ba4c41 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -19,6 +19,23 @@ #include #include +static struct rockchip_ip rocklist = { + .dev_list = LIST_HEAD_INIT(rocklist.dev_list), + .lock = __SPIN_LOCK_UNLOCKED(rocklist.lock), +}; + +struct rk_crypto_info *get_rk_crypto(void) +{ + struct rk_crypto_info *first; + + spin_lock(&rocklist.lock); + first = list_first_entry_or_null(&rocklist.dev_list, + struct rk_crypto_info, list); + list_rotate_left(&rocklist.dev_list); + spin_unlock(&rocklist.lock); + return first; +} + static const struct rk_variant rk3288_variant = { .num_clks = 4, .rkclks = { @@ -30,6 +47,10 @@ static const struct rk_variant rk3328_variant = { .num_clks = 3, }; +static const struct rk_variant rk3399_variant = { + .num_clks = 3, +}; + static int rk_crypto_get_clks(struct rk_crypto_info *dev) { int i, j, err; @@ -83,8 +104,8 @@ static void rk_crypto_disable_clk(struct rk_crypto_info *dev) } /* - * Power management strategy: The device is suspended unless a TFM exists for - * one of the algorithms proposed by this driver. + * Power management strategy: The device is suspended until a request + * is handled. For avoiding suspend/resume yoyo, the autosuspend is set to 2s. */ static int rk_crypto_pm_suspend(struct device *dev) { @@ -166,8 +187,17 @@ static struct rk_crypto_tmp *rk_cipher_algs[] = { #ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG static int rk_crypto_debugfs_show(struct seq_file *seq, void *v) { + struct rk_crypto_info *dd; unsigned int i; + spin_lock(&rocklist.lock); + list_for_each_entry(dd, &rocklist.dev_list, list) { + seq_printf(seq, "%s %s requests: %lu\n", + dev_driver_string(dd->dev), dev_name(dd->dev), + dd->nreq); + } + spin_unlock(&rocklist.lock); + for (i = 0; i < ARRAY_SIZE(rk_cipher_algs); i++) { if (!rk_cipher_algs[i]->dev) continue; @@ -198,6 +228,18 @@ static int rk_crypto_debugfs_show(struct seq_file *seq, void *v) DEFINE_SHOW_ATTRIBUTE(rk_crypto_debugfs); #endif +static void register_debugfs(struct rk_crypto_info *crypto_info) +{ +#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG + /* Ignore error of debugfs */ + rocklist.dbgfs_dir = debugfs_create_dir("rk3288_crypto", NULL); + rocklist.dbgfs_stats = debugfs_create_file("stats", 0444, + rocklist.dbgfs_dir, + &rocklist, + &rk_crypto_debugfs_fops); +#endif +} + static int rk_crypto_register(struct rk_crypto_info *crypto_info) { unsigned int i, k; @@ -255,6 +297,9 @@ static const struct of_device_id crypto_of_id_table[] = { { .compatible = "rockchip,rk3328-crypto", .data = &rk3328_variant, }, + { .compatible = "rockchip,rk3399-crypto", + .data = &rk3399_variant, + }, {} }; MODULE_DEVICE_TABLE(of, crypto_of_id_table); @@ -262,7 +307,7 @@ MODULE_DEVICE_TABLE(of, crypto_of_id_table); static int rk_crypto_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct rk_crypto_info *crypto_info; + struct rk_crypto_info *crypto_info, *first; int err = 0; crypto_info = devm_kzalloc(&pdev->dev, @@ -325,22 +370,22 @@ static int rk_crypto_probe(struct platform_device *pdev) if (err) goto err_pm; - err = rk_crypto_register(crypto_info); - if (err) { - dev_err(dev, "err in register alg"); - goto err_register_alg; - } + spin_lock(&rocklist.lock); + first = list_first_entry_or_null(&rocklist.dev_list, + struct rk_crypto_info, list); + list_add_tail(&crypto_info->list, &rocklist.dev_list); + spin_unlock(&rocklist.lock); + + if (!first) { + err = rk_crypto_register(crypto_info); + if (err) { + dev_err(dev, "Fail to register crypto algorithms"); + goto err_register_alg; + } -#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG - /* Ignore error of debugfs */ - crypto_info->dbgfs_dir = debugfs_create_dir("rk3288_crypto", NULL); - crypto_info->dbgfs_stats = debugfs_create_file("stats", 0444, - crypto_info->dbgfs_dir, - crypto_info, - &rk_crypto_debugfs_fops); -#endif + register_debugfs(crypto_info); + } - dev_info(dev, "Crypto Accelerator successfully registered\n"); return 0; err_register_alg: @@ -355,11 +400,20 @@ err_crypto: static int rk_crypto_remove(struct platform_device *pdev) { struct rk_crypto_info *crypto_tmp = platform_get_drvdata(pdev); + struct rk_crypto_info *first; + + spin_lock_bh(&rocklist.lock); + list_del(&crypto_tmp->list); + first = list_first_entry_or_null(&rocklist.dev_list, + struct rk_crypto_info, list); + spin_unlock_bh(&rocklist.lock); + if (!first) { #ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG - debugfs_remove_recursive(crypto_tmp->dbgfs_dir); + debugfs_remove_recursive(rocklist.dbgfs_dir); #endif - rk_crypto_unregister(); + rk_crypto_unregister(); + } rk_crypto_pm_exit(crypto_tmp); crypto_engine_exit(crypto_tmp->engine); return 0; diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index ac979d67ced98..b2695258cade9 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -190,6 +190,20 @@ #define RK_MAX_CLKS 4 +/* + * struct rockchip_ip - struct for managing a list of RK crypto instance + * @dev_list: Used for doing a list of rk_crypto_info + * @lock: Control access to dev_list + * @dbgfs_dir: Debugfs dentry for statistic directory + * @dbgfs_stats: Debugfs dentry for statistic counters + */ +struct rockchip_ip { + struct list_head dev_list; + spinlock_t lock; /* Control access to dev_list */ + struct dentry *dbgfs_dir; + struct dentry *dbgfs_stats; +}; + struct rk_clks { const char *name; unsigned long max; @@ -201,6 +215,7 @@ struct rk_variant { }; struct rk_crypto_info { + struct list_head list; struct device *dev; struct clk_bulk_data *clks; int num_clks; @@ -208,19 +223,15 @@ struct rk_crypto_info { void __iomem *reg; int irq; const struct rk_variant *variant; + unsigned long nreq; struct crypto_engine *engine; struct completion complete; int status; -#ifdef CONFIG_CRYPTO_DEV_ROCKCHIP_DEBUG - struct dentry *dbgfs_dir; - struct dentry *dbgfs_stats; -#endif }; /* the private variable of hash */ struct rk_ahash_ctx { struct crypto_engine_ctx enginectx; - struct rk_crypto_info *dev; /* for fallback */ struct crypto_ahash *fallback_tfm; }; @@ -236,7 +247,6 @@ struct rk_ahash_rctx { /* the private variable of cipher */ struct rk_cipher_ctx { struct crypto_engine_ctx enginectx; - struct rk_crypto_info *dev; unsigned int keylen; u8 key[AES_MAX_KEY_SIZE]; u8 iv[AES_BLOCK_SIZE]; @@ -252,7 +262,7 @@ struct rk_cipher_rctx { struct rk_crypto_tmp { u32 type; - struct rk_crypto_info *dev; + struct rk_crypto_info *dev; union { struct skcipher_alg skcipher; struct ahash_alg hash; @@ -276,4 +286,5 @@ extern struct rk_crypto_tmp rk_ahash_sha1; extern struct rk_crypto_tmp rk_ahash_sha256; extern struct rk_crypto_tmp rk_ahash_md5; +struct rk_crypto_info *get_rk_crypto(void); #endif diff --git a/drivers/crypto/rockchip/rk3288_crypto_ahash.c b/drivers/crypto/rockchip/rk3288_crypto_ahash.c index 30f78256c9551..a78ff3dcd0b1f 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_ahash.c +++ b/drivers/crypto/rockchip/rk3288_crypto_ahash.c @@ -199,8 +199,8 @@ static int rk_ahash_export(struct ahash_request *req, void *out) static int rk_ahash_digest(struct ahash_request *req) { struct rk_ahash_rctx *rctx = ahash_request_ctx(req); - struct rk_ahash_ctx *tctx = crypto_tfm_ctx(req->base.tfm); - struct rk_crypto_info *dev = tctx->dev; + struct rk_crypto_info *dev; + struct crypto_engine *engine; if (rk_ahash_need_fallback(req)) return rk_ahash_digest_fb(req); @@ -208,9 +208,12 @@ static int rk_ahash_digest(struct ahash_request *req) if (!req->nbytes) return zero_message_process(req); + dev = get_rk_crypto(); + rctx->dev = dev; + engine = dev->engine; - return crypto_transfer_hash_request_to_engine(dev->engine, req); + return crypto_transfer_hash_request_to_engine(engine, req); } static void crypto_ahash_dma_start(struct rk_crypto_info *dev, struct scatterlist *sg) @@ -260,9 +263,14 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) int i; u32 v; + err = pm_runtime_resume_and_get(rkc->dev); + if (err) + return err; + rctx->mode = 0; algt->stat_req++; + rkc->nreq++; switch (crypto_ahash_digestsize(tfm)) { case SHA1_DIGEST_SIZE: @@ -313,6 +321,8 @@ static int rk_hash_run(struct crypto_engine *engine, void *breq) } theend: + pm_runtime_put_autosuspend(rkc->dev); + local_bh_disable(); crypto_finalize_hash_request(engine, breq, err); local_bh_enable(); @@ -323,21 +333,15 @@ theend: static int rk_cra_hash_init(struct crypto_tfm *tfm) { struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); - struct rk_crypto_tmp *algt; - struct ahash_alg *alg = __crypto_ahash_alg(tfm->__crt_alg); - const char *alg_name = crypto_tfm_alg_name(tfm); - int err; - - algt = container_of(alg, struct rk_crypto_tmp, alg.hash); - - tctx->dev = algt->dev; + struct ahash_alg *alg = __crypto_ahash_alg(tfm->__crt_alg); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.hash); /* for fallback */ tctx->fallback_tfm = crypto_alloc_ahash(alg_name, 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(tctx->fallback_tfm)) { - dev_err(tctx->dev->dev, "Could not load fallback driver.\n"); + dev_err(algt->dev->dev, "Could not load fallback driver.\n"); return PTR_ERR(tctx->fallback_tfm); } @@ -349,15 +353,7 @@ static int rk_cra_hash_init(struct crypto_tfm *tfm) tctx->enginectx.op.prepare_request = rk_hash_prepare; tctx->enginectx.op.unprepare_request = rk_hash_unprepare; - err = pm_runtime_resume_and_get(tctx->dev->dev); - if (err < 0) - goto error_pm; - return 0; -error_pm: - crypto_free_ahash(tctx->fallback_tfm); - - return err; } static void rk_cra_hash_exit(struct crypto_tfm *tfm) @@ -365,7 +361,6 @@ static void rk_cra_hash_exit(struct crypto_tfm *tfm) struct rk_ahash_ctx *tctx = crypto_tfm_ctx(tfm); crypto_free_ahash(tctx->fallback_tfm); - pm_runtime_put_autosuspend(tctx->dev->dev); } struct rk_crypto_tmp rk_ahash_sha1 = { diff --git a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c index 0b1c90ababb71..59069457582bf 100644 --- a/drivers/crypto/rockchip/rk3288_crypto_skcipher.c +++ b/drivers/crypto/rockchip/rk3288_crypto_skcipher.c @@ -17,11 +17,11 @@ static int rk_cipher_need_fallback(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - unsigned int bs = crypto_skcipher_blocksize(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); struct scatterlist *sgs, *sgd; unsigned int stodo, dtodo, len; + unsigned int bs = crypto_skcipher_blocksize(tfm); if (!req->cryptlen) return true; @@ -84,15 +84,16 @@ static int rk_cipher_fallback(struct skcipher_request *areq) static int rk_cipher_handle_req(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct rk_cipher_ctx *tctx = crypto_skcipher_ctx(tfm); struct rk_cipher_rctx *rctx = skcipher_request_ctx(req); - struct rk_crypto_info *rkc = tctx->dev; - struct crypto_engine *engine = rkc->engine; + struct rk_crypto_info *rkc; + struct crypto_engine *engine; if (rk_cipher_need_fallback(req)) return rk_cipher_fallback(req); + rkc = get_rk_crypto(); + + engine = rkc->engine; rctx->dev = rkc; return crypto_transfer_skcipher_request_to_engine(engine, req); @@ -307,7 +308,12 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); struct rk_crypto_info *rkc = rctx->dev; + err = pm_runtime_resume_and_get(rkc->dev); + if (err) + return err; + algt->stat_req++; + rkc->nreq++; ivsize = crypto_skcipher_ivsize(tfm); if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { @@ -401,6 +407,8 @@ static int rk_cipher_run(struct crypto_engine *engine, void *async_req) } theend: + pm_runtime_put_autosuspend(rkc->dev); + local_bh_disable(); crypto_finalize_skcipher_request(engine, areq, err); local_bh_enable(); @@ -420,18 +428,13 @@ theend_iv: static int rk_cipher_tfm_init(struct crypto_skcipher *tfm) { struct rk_cipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_alg *alg = crypto_skcipher_alg(tfm); const char *name = crypto_tfm_alg_name(&tfm->base); - struct rk_crypto_tmp *algt; - int err; - - algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); - - ctx->dev = algt->dev; + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + struct rk_crypto_tmp *algt = container_of(alg, struct rk_crypto_tmp, alg.skcipher); ctx->fallback_tfm = crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(ctx->fallback_tfm)) { - dev_err(ctx->dev->dev, "ERROR: Cannot allocate fallback for %s %ld\n", + dev_err(algt->dev->dev, "ERROR: Cannot allocate fallback for %s %ld\n", name, PTR_ERR(ctx->fallback_tfm)); return PTR_ERR(ctx->fallback_tfm); } @@ -441,14 +444,7 @@ static int rk_cipher_tfm_init(struct crypto_skcipher *tfm) ctx->enginectx.op.do_one_request = rk_cipher_run; - err = pm_runtime_resume_and_get(ctx->dev->dev); - if (err < 0) - goto error_pm; - return 0; -error_pm: - crypto_free_skcipher(ctx->fallback_tfm); - return err; } static void rk_cipher_tfm_exit(struct crypto_skcipher *tfm) @@ -457,7 +453,6 @@ static void rk_cipher_tfm_exit(struct crypto_skcipher *tfm) memzero_explicit(ctx->key, ctx->keylen); crypto_free_skcipher(ctx->fallback_tfm); - pm_runtime_put_autosuspend(ctx->dev->dev); } struct rk_crypto_tmp rk_ecb_aes_alg = { -- GitLab From 7984ceb134bf31aa9a597f10ed52d831d5aede14 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 17 Oct 2022 14:25:00 -0500 Subject: [PATCH 0262/1423] crypto: af_alg - Support symmetric encryption via keyring keys We want to leverage keyring to store sensitive keys, and then use those keys for symmetric encryption via the crypto API. Among the key types we wish to support are: user, logon, encrypted, and trusted. User key types are already able to have their data copied to user space, but logon does not support this. Further, trusted and encrypted keys will return their encrypted data back to user space on read, which does not make them ideal for symmetric encryption. To support symmetric encryption for these key types, add a new ALG_SET_KEY_BY_KEY_SERIAL setsockopt() option to the crypto API. This allows users to pass a key_serial_t to the crypto API to perform symmetric encryption. The behavior is the same as ALG_SET_KEY, but the crypto key data is copied in kernel space from a keyring key, which allows for the support of logon, encrypted, and trusted key types. Keyring keys must have the KEY_(POS|USR|GRP|OTH)_SEARCH permission set to leverage this feature. This follows the asymmetric_key type where key lookup calls eventually lead to keyring_search_rcu() without the KEYRING_SEARCH_NO_CHECK_PERM flag set. Signed-off-by: Frederick Lawler Signed-off-by: Herbert Xu --- Documentation/crypto/userspace-if.rst | 15 ++- crypto/af_alg.c | 135 +++++++++++++++++++++++++- include/uapi/linux/if_alg.h | 1 + 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/Documentation/crypto/userspace-if.rst b/Documentation/crypto/userspace-if.rst index b45dabbf69d68..f80f243e227e6 100644 --- a/Documentation/crypto/userspace-if.rst +++ b/Documentation/crypto/userspace-if.rst @@ -131,9 +131,9 @@ from the kernel crypto API. If the buffer is too small for the message digest, the flag MSG_TRUNC is set by the kernel. In order to set a message digest key, the calling application must use -the setsockopt() option of ALG_SET_KEY. If the key is not set the HMAC -operation is performed without the initial HMAC state change caused by -the key. +the setsockopt() option of ALG_SET_KEY or ALG_SET_KEY_BY_KEY_SERIAL. If the +key is not set the HMAC operation is performed without the initial HMAC state +change caused by the key. Symmetric Cipher API -------------------- @@ -382,6 +382,15 @@ mentioned optname: - the RNG cipher type to provide the seed +- ALG_SET_KEY_BY_KEY_SERIAL -- Setting the key via keyring key_serial_t. + This operation behaves the same as ALG_SET_KEY. The decrypted + data is copied from a keyring key, and uses that data as the + key for symmetric encryption. + + The passed in key_serial_t must have the KEY_(POS|USR|GRP|OTH)_SEARCH + permission set, otherwise -EPERM is returned. Supports key types: user, + logon, encrypted, and trusted. + - ALG_SET_AEAD_AUTHSIZE -- Setting the authentication tag size for AEAD ciphers. For a encryption operation, the authentication tag of the given size will be generated. For a decryption operation, the diff --git a/crypto/af_alg.c b/crypto/af_alg.c index e893c0f6c8799..0a4fa2a429e22 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -19,6 +21,10 @@ #include #include #include +#include +#include +#include +#include struct alg_type_list { const struct af_alg_type *type; @@ -222,6 +228,129 @@ out: return err; } +#ifdef CONFIG_KEYS + +static const u8 *key_data_ptr_user(const struct key *key, + unsigned int *datalen) +{ + const struct user_key_payload *ukp; + + ukp = user_key_payload_locked(key); + if (IS_ERR_OR_NULL(ukp)) + return ERR_PTR(-EKEYREVOKED); + + *datalen = key->datalen; + + return ukp->data; +} + +static const u8 *key_data_ptr_encrypted(const struct key *key, + unsigned int *datalen) +{ + const struct encrypted_key_payload *ekp; + + ekp = dereference_key_locked(key); + if (IS_ERR_OR_NULL(ekp)) + return ERR_PTR(-EKEYREVOKED); + + *datalen = ekp->decrypted_datalen; + + return ekp->decrypted_data; +} + +static const u8 *key_data_ptr_trusted(const struct key *key, + unsigned int *datalen) +{ + const struct trusted_key_payload *tkp; + + tkp = dereference_key_locked(key); + if (IS_ERR_OR_NULL(tkp)) + return ERR_PTR(-EKEYREVOKED); + + *datalen = tkp->key_len; + + return tkp->key; +} + +static struct key *lookup_key(key_serial_t serial) +{ + key_ref_t key_ref; + + key_ref = lookup_user_key(serial, 0, KEY_NEED_SEARCH); + if (IS_ERR(key_ref)) + return ERR_CAST(key_ref); + + return key_ref_to_ptr(key_ref); +} + +static int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval, + unsigned int optlen) +{ + const struct af_alg_type *type = ask->type; + u8 *key_data = NULL; + unsigned int key_datalen; + key_serial_t serial; + struct key *key; + const u8 *ret; + int err; + + if (optlen != sizeof(serial)) + return -EINVAL; + + if (copy_from_sockptr(&serial, optval, optlen)) + return -EFAULT; + + key = lookup_key(serial); + if (IS_ERR(key)) + return PTR_ERR(key); + + down_read(&key->sem); + + ret = ERR_PTR(-ENOPROTOOPT); + if (!strcmp(key->type->name, "user") || + !strcmp(key->type->name, "logon")) { + ret = key_data_ptr_user(key, &key_datalen); + } else if (IS_REACHABLE(CONFIG_ENCRYPTED_KEYS) && + !strcmp(key->type->name, "encrypted")) { + ret = key_data_ptr_encrypted(key, &key_datalen); + } else if (IS_REACHABLE(CONFIG_TRUSTED_KEYS) && + !strcmp(key->type->name, "trusted")) { + ret = key_data_ptr_trusted(key, &key_datalen); + } + + if (IS_ERR(ret)) { + up_read(&key->sem); + return PTR_ERR(ret); + } + + key_data = sock_kmalloc(&ask->sk, key_datalen, GFP_KERNEL); + if (!key_data) { + up_read(&key->sem); + return -ENOMEM; + } + + memcpy(key_data, ret, key_datalen); + + up_read(&key->sem); + + err = type->setkey(ask->private, key_data, key_datalen); + + sock_kzfree_s(&ask->sk, key_data, key_datalen); + + return err; +} + +#else + +static inline int alg_setkey_by_key_serial(struct alg_sock *ask, + sockptr_t optval, + unsigned int optlen) +{ + return -ENOPROTOOPT; +} + +#endif + static int alg_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -242,12 +371,16 @@ static int alg_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case ALG_SET_KEY: + case ALG_SET_KEY_BY_KEY_SERIAL: if (sock->state == SS_CONNECTED) goto unlock; if (!type->setkey) goto unlock; - err = alg_setkey(sk, optval, optlen); + if (optname == ALG_SET_KEY_BY_KEY_SERIAL) + err = alg_setkey_by_key_serial(ask, optval, optlen); + else + err = alg_setkey(sk, optval, optlen); break; case ALG_SET_AEAD_AUTHSIZE: if (sock->state == SS_CONNECTED) diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 578b18aab8215..0824fbc026a1c 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -52,6 +52,7 @@ struct af_alg_iv { #define ALG_SET_AEAD_ASSOCLEN 4 #define ALG_SET_AEAD_AUTHSIZE 5 #define ALG_SET_DRBG_ENTROPY 6 +#define ALG_SET_KEY_BY_KEY_SERIAL 7 /* Operations */ #define ALG_OP_DECRYPT 0 -- GitLab From 3efe90af4c0c46c58dba1b306de142827153d9c0 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 22 Oct 2022 01:17:44 +0000 Subject: [PATCH 0263/1423] crypto: hisilicon/qm - increase the memory of local variables Increase the buffer to prevent stack overflow by fuzz test. The maximum length of the qos configuration buffer is 256 bytes. Currently, the value of the 'val buffer' is only 32 bytes. The sscanf does not check the dest memory length. So the 'val buffer' may stack overflow. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index e3edb176d9765..5d79e9f0e7e14 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -250,7 +250,6 @@ #define QM_QOS_MIN_CIR_B 100 #define QM_QOS_MAX_CIR_U 6 #define QM_QOS_MAX_CIR_S 11 -#define QM_QOS_VAL_MAX_LEN 32 #define QM_DFX_BASE 0x0100000 #define QM_DFX_STATE1 0x0104000 #define QM_DFX_STATE2 0x01040C8 @@ -4612,7 +4611,7 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, unsigned int *fun_index) { char tbuf_bdf[QM_DBG_READ_LEN] = {0}; - char val_buf[QM_QOS_VAL_MAX_LEN] = {0}; + char val_buf[QM_DBG_READ_LEN] = {0}; u32 tmp1, device, function; int ret, bus; -- GitLab From 22d7a6c39cabab811f42cb2daed2343c87b0aca5 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 22 Oct 2022 01:17:45 +0000 Subject: [PATCH 0264/1423] crypto: hisilicon/qm - add pci bdf number check The pci bdf number check is added for qos written by using the pci api. Directly get the devfn by pci_dev, so delete some redundant code. And use the kstrtoul instead of sscanf to simplify code. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 37 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 5d79e9f0e7e14..80eeb966cf891 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4589,49 +4589,36 @@ err_put_dfx_access: return ret; } -static ssize_t qm_qos_value_init(const char *buf, unsigned long *val) -{ - int buflen = strlen(buf); - int ret, i; - - for (i = 0; i < buflen; i++) { - if (!isdigit(buf[i])) - return -EINVAL; - } - - ret = sscanf(buf, "%lu", val); - if (ret != QM_QOS_VAL_NUM) - return -EINVAL; - - return 0; -} - static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, unsigned long *val, unsigned int *fun_index) { + struct bus_type *bus_type = qm->pdev->dev.bus; char tbuf_bdf[QM_DBG_READ_LEN] = {0}; char val_buf[QM_DBG_READ_LEN] = {0}; - u32 tmp1, device, function; - int ret, bus; + struct pci_dev *pdev; + struct device *dev; + int ret; ret = sscanf(buf, "%s %s", tbuf_bdf, val_buf); if (ret != QM_QOS_PARAM_NUM) return -EINVAL; - ret = qm_qos_value_init(val_buf, val); + ret = kstrtoul(val_buf, 10, val); if (ret || *val == 0 || *val > QM_QOS_MAX_VAL) { pci_err(qm->pdev, "input qos value is error, please set 1~1000!\n"); return -EINVAL; } - ret = sscanf(tbuf_bdf, "%u:%x:%u.%u", &tmp1, &bus, &device, &function); - if (ret != QM_QOS_BDF_PARAM_NUM) { - pci_err(qm->pdev, "input pci bdf value is error!\n"); - return -EINVAL; + dev = bus_find_device_by_name(bus_type, NULL, tbuf_bdf); + if (!dev) { + pci_err(qm->pdev, "input pci bdf number is error!\n"); + return -ENODEV; } - *fun_index = PCI_DEVFN(device, function); + pdev = container_of(dev, struct pci_dev, dev); + + *fun_index = pdev->devfn; return 0; } -- GitLab From 8f82f4ae8946d665f1e38da8e2b39b929d2435b1 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 22 Oct 2022 01:17:46 +0000 Subject: [PATCH 0265/1423] crypto: hisilicon/qm - delete redundancy check Because the permission on the VF debugfs file is "0444". So the VF function checking is redundant in qos writing api. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 80eeb966cf891..363a02810a164 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -4632,9 +4632,6 @@ static ssize_t qm_algqos_write(struct file *filp, const char __user *buf, unsigned long val; int len, ret; - if (qm->fun_type == QM_HW_VF) - return -EINVAL; - if (*pos != 0) return 0; -- GitLab From d6e9aa6e1ea872d1bbdf08ac78245cf8efeda19c Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 13:38:02 +0800 Subject: [PATCH 0266/1423] crypto: octeontx - fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx/otx_cpt_hw_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/marvell/octeontx/otx_cpt_hw_types.h b/drivers/crypto/marvell/octeontx/otx_cpt_hw_types.h index 205eacac4a348..f8aedafdfdc5f 100644 --- a/drivers/crypto/marvell/octeontx/otx_cpt_hw_types.h +++ b/drivers/crypto/marvell/octeontx/otx_cpt_hw_types.h @@ -534,7 +534,7 @@ union otx_cptx_vqx_misc_ena_w1s { * Word0 * reserved_20_63:44 [63:20] Reserved. * dbell_cnt:20 [19:0](R/W/H) Number of instruction queue 64-bit words to add - * to the CPT instruction doorbell count. Readback value is the the + * to the CPT instruction doorbell count. Readback value is the * current number of pending doorbell requests. If counter overflows * CPT()_VQ()_MISC_INT[DBELL_DOVF] is set. To reset the count back to * zero, write one to clear CPT()_VQ()_MISC_INT_ENA_W1C[DBELL_DOVF], -- GitLab From 05e88ebb9ecfe9631ccc6483a79b0eabf554da60 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:02 -0500 Subject: [PATCH 0267/1423] RDMA/rxe: Remove redundant header files Remove unneeded include files. Link: https://lore.kernel.org/r/20221021200118.2163-2-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index ec2b7de1c4972..3fbaba9eec396 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -4,10 +4,6 @@ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ -#include -#include -#include - #include "rxe.h" int __rxe_do_task(struct rxe_task *task) -- GitLab From 98a54f170617746b5d09b18b23b295efc7a42a5e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:03 -0500 Subject: [PATCH 0268/1423] RDMA/rxe: Remove init of task locks from rxe_qp.c The calls to spin_lock_init() for the tasklet spinlocks in rxe_qp_init_misc() are redundant since they are intiialized in rxe_init_task(). This patch removes them. Link: https://lore.kernel.org/r/20221021200118.2163-3-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index a62bab88415cb..57c3f05ad15b1 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -172,10 +172,6 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_init(&qp->state_lock); - spin_lock_init(&qp->req.task.state_lock); - spin_lock_init(&qp->resp.task.state_lock); - spin_lock_init(&qp->comp.task.state_lock); - spin_lock_init(&qp->sq.sq_lock); spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); -- GitLab From de669ae8af49ceed0eed44f5b3d51dc62affc5e4 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:04 -0500 Subject: [PATCH 0269/1423] RDMA/rxe: Removed unused name from rxe_task struct The name field in struct rxe_task is never used. This patch removes it. Link: https://lore.kernel.org/r/20221021200118.2163-4-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 9 +++------ drivers/infiniband/sw/rxe/rxe_task.c | 4 +--- drivers/infiniband/sw/rxe/rxe_task.h | 4 +--- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 57c3f05ad15b1..03bd9f3e9956a 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -238,10 +238,8 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, skb_queue_head_init(&qp->req_pkts); - rxe_init_task(&qp->req.task, qp, - rxe_requester, "req"); - rxe_init_task(&qp->comp.task, qp, - rxe_completer, "comp"); + rxe_init_task(&qp->req.task, qp, rxe_requester); + rxe_init_task(&qp->comp.task, qp, rxe_completer); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { @@ -288,8 +286,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, skb_queue_head_init(&qp->resp_pkts); - rxe_init_task(&qp->resp.task, qp, - rxe_responder, "resp"); + rxe_init_task(&qp->resp.task, qp, rxe_responder); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 3fbaba9eec396..0cbba455fefd8 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -90,12 +90,10 @@ void rxe_do_task(struct tasklet_struct *t) task->ret = ret; } -int rxe_init_task(struct rxe_task *task, - void *arg, int (*func)(void *), char *name) +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) { task->arg = arg; task->func = func; - snprintf(task->name, sizeof(task->name), "%s", name); task->destroyed = false; tasklet_setup(&task->tasklet, rxe_do_task); diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 7f612a1c68a7b..b3dfd970d1dc6 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -25,7 +25,6 @@ struct rxe_task { void *arg; int (*func)(void *arg); int ret; - char name[16]; bool destroyed; }; @@ -34,8 +33,7 @@ struct rxe_task { * arg => parameter to pass to fcn * func => function to call until it returns != 0 */ -int rxe_init_task(struct rxe_task *task, - void *arg, int (*func)(void *), char *name); +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)); /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -- GitLab From dccb23f6c312e4480fe32ccbc2afac1a5cac7e5e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:05 -0500 Subject: [PATCH 0270/1423] RDMA/rxe: Split rxe_run_task() into two subroutines Split rxe_run_task(task, sched) into rxe_run_task(task) and rxe_sched_task(task). Link: https://lore.kernel.org/r/20221021200118.2163-5-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 19 +++++++++++-------- drivers/infiniband/sw/rxe/rxe_net.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_qp.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++++- drivers/infiniband/sw/rxe/rxe_task.c | 15 ++++++++++----- drivers/infiniband/sw/rxe/rxe_task.h | 7 +++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 8 ++++---- 8 files changed, 44 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index c9170dd99f3aa..66f392810c860 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -118,7 +118,7 @@ void retransmit_timer(struct timer_list *t) if (qp->valid) { qp->comp.timeout = 1; - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); } } @@ -132,7 +132,10 @@ void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) if (must_sched != 0) rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); - rxe_run_task(&qp->comp.task, must_sched); + if (must_sched) + rxe_sched_task(&qp->comp.task); + else + rxe_run_task(&qp->comp.task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, @@ -313,7 +316,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } return COMPST_ERROR_RETRY; @@ -460,7 +463,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } @@ -474,7 +477,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } @@ -520,7 +523,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } } @@ -654,7 +657,7 @@ int rxe_completer(void *arg) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } state = COMPST_DONE; @@ -722,7 +725,7 @@ int rxe_completer(void *arg) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } goto done; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 35f327b9d4b8e..c36cad9c7a664 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -345,7 +345,7 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) if (unlikely(qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); rxe_put(qp); } @@ -429,7 +429,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, if ((qp_type(qp) != IB_QPT_RC) && (pkt->mask & RXE_END_MASK)) { pkt->wqe->state = wqe_state_done; - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 03bd9f3e9956a..3f6d62a80bea9 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -536,10 +536,10 @@ static void rxe_qp_drain(struct rxe_qp *qp) if (qp->req.state != QP_STATE_DRAINED) { qp->req.state = QP_STATE_DRAIN; if (qp_type(qp) == IB_QPT_RC) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); else __rxe_do_task(&qp->comp.task); - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } } } @@ -553,13 +553,13 @@ void rxe_qp_error(struct rxe_qp *qp) qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ - rxe_run_task(&qp->resp.task, 1); + rxe_sched_task(&qp->resp.task); if (qp_type(qp) == IB_QPT_RC) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); else __rxe_do_task(&qp->comp.task); - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } /* called by the modify qp verb */ diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f637712079705..41f1d84f0acbf 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -105,7 +105,7 @@ void rnr_nak_timer(struct timer_list *t) /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) @@ -608,7 +608,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) * which can lead to a deadlock. So go ahead and complete * it now. */ - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); return 0; } @@ -733,7 +733,7 @@ int rxe_requester(void *arg) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_run_task(&qp->comp.task, 0); + rxe_run_task(&qp->comp.task); goto done; } payload = mtu; @@ -795,7 +795,7 @@ int rxe_requester(void *arg) rollback_state(wqe, qp, &rollback_wqe, rollback_psn); if (err == -EAGAIN) { - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); goto exit; } @@ -817,7 +817,7 @@ err: qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); wqe->state = wqe_state_error; qp->req.state = QP_STATE_ERROR; - rxe_run_task(&qp->comp.task, 0); + rxe_run_task(&qp->comp.task); exit: ret = -EAGAIN; out: diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 95d372db934d7..c32bc12cc82f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -91,7 +91,10 @@ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || (skb_queue_len(&qp->req_pkts) > 1); - rxe_run_task(&qp->resp.task, must_sched); + if (must_sched) + rxe_sched_task(&qp->resp.task); + else + rxe_run_task(&qp->resp.task); } static inline enum resp_states get_req(struct rxe_qp *qp, diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 0cbba455fefd8..442b7348acdc3 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -123,15 +123,20 @@ void rxe_cleanup_task(struct rxe_task *task) tasklet_kill(&task->tasklet); } -void rxe_run_task(struct rxe_task *task, int sched) +void rxe_run_task(struct rxe_task *task) { if (task->destroyed) return; - if (sched) - tasklet_schedule(&task->tasklet); - else - rxe_do_task(&task->tasklet); + rxe_do_task(&task->tasklet); +} + +void rxe_sched_task(struct rxe_task *task) +{ + if (task->destroyed) + return; + + tasklet_schedule(&task->tasklet); } void rxe_disable_task(struct rxe_task *task) diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index b3dfd970d1dc6..590b1c1d7e7ca 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -52,10 +52,9 @@ int __rxe_do_task(struct rxe_task *task); */ void rxe_do_task(struct tasklet_struct *t); -/* run a task, else schedule it to run as a tasklet, The decision - * to run or schedule tasklet is based on the parameter sched. - */ -void rxe_run_task(struct rxe_task *task, int sched); +void rxe_run_task(struct rxe_task *task); + +void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ void rxe_disable_task(struct rxe_task *task); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 88825edc7dce1..f2f82efbaf6d6 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -695,9 +695,9 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, wr = next; } - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); if (unlikely(qp->req.state == QP_STATE_ERROR)) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); return err; } @@ -719,7 +719,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); return 0; } else return rxe_post_send_kernel(qp, wr, bad_wr); @@ -759,7 +759,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); if (qp->resp.state == QP_STATE_ERROR) - rxe_run_task(&qp->resp.task, 1); + rxe_sched_task(&qp->resp.task); err1: return err; -- GitLab From dcef28528cce82a82134abd393aa0f38f2edf77e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:06 -0500 Subject: [PATCH 0271/1423] RDMA/rxe: Make rxe_do_task static The subroutine rxe_do_task() is only called in rxe_task.c. This patch makes it static and renames it do_task(). Link: https://lore.kernel.org/r/20221021200118.2163-6-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 6 +++--- drivers/infiniband/sw/rxe/rxe_task.h | 8 -------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 442b7348acdc3..fb953f5195b87 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -24,7 +24,7 @@ int __rxe_do_task(struct rxe_task *task) * a second caller finds the task already running * but looks just after the last call to func */ -void rxe_do_task(struct tasklet_struct *t) +static void do_task(struct tasklet_struct *t) { int cont; int ret; @@ -96,7 +96,7 @@ int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) task->func = func; task->destroyed = false; - tasklet_setup(&task->tasklet, rxe_do_task); + tasklet_setup(&task->tasklet, do_task); task->state = TASK_STATE_START; spin_lock_init(&task->state_lock); @@ -128,7 +128,7 @@ void rxe_run_task(struct rxe_task *task) if (task->destroyed) return; - rxe_do_task(&task->tasklet); + do_task(&task->tasklet); } void rxe_sched_task(struct rxe_task *task) diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 590b1c1d7e7ca..99e0173e5c46d 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -44,14 +44,6 @@ void rxe_cleanup_task(struct rxe_task *task); */ int __rxe_do_task(struct rxe_task *task); -/* - * common function called by any of the main tasklets - * If there is any chance that there is additional - * work to do someone must reschedule the task before - * leaving - */ -void rxe_do_task(struct tasklet_struct *t); - void rxe_run_task(struct rxe_task *task); void rxe_sched_task(struct rxe_task *task); -- GitLab From 63a18baef2653f59a7c5b990283628bd54d062fd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:07 -0500 Subject: [PATCH 0272/1423] RDMA/rxe: Rename task->state_lock to task->lock Rename task-state_lock to task->lock Link: https://lore.kernel.org/r/20221021200118.2163-7-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 18 +++++++++--------- drivers/infiniband/sw/rxe/rxe_task.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index fb953f5195b87..0208d833a41b9 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -31,22 +31,22 @@ static void do_task(struct tasklet_struct *t) struct rxe_task *task = from_tasklet(task, t, tasklet); unsigned int iterations = RXE_MAX_ITERATIONS; - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); switch (task->state) { case TASK_STATE_START: task->state = TASK_STATE_BUSY; - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); break; case TASK_STATE_BUSY: task->state = TASK_STATE_ARMED; fallthrough; case TASK_STATE_ARMED: - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); return; default: - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); pr_warn("%s failed with bad state %d\n", __func__, task->state); return; } @@ -55,7 +55,7 @@ static void do_task(struct tasklet_struct *t) cont = 0; ret = task->func(task->arg); - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); switch (task->state) { case TASK_STATE_BUSY: if (ret) { @@ -84,7 +84,7 @@ static void do_task(struct tasklet_struct *t) pr_warn("%s failed with bad state %d\n", __func__, task->state); } - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); } while (cont); task->ret = ret; @@ -99,7 +99,7 @@ int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) tasklet_setup(&task->tasklet, do_task); task->state = TASK_STATE_START; - spin_lock_init(&task->state_lock); + spin_lock_init(&task->lock); return 0; } @@ -115,9 +115,9 @@ void rxe_cleanup_task(struct rxe_task *task) task->destroyed = true; do { - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); idle = (task->state == TASK_STATE_START); - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); } while (!idle); tasklet_kill(&task->tasklet); diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 99e0173e5c46d..7b88129702ac6 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -21,7 +21,7 @@ enum { struct rxe_task { struct tasklet_struct tasklet; int state; - spinlock_t state_lock; /* spinlock for task state */ + spinlock_t lock; void *arg; int (*func)(void *arg); int ret; -- GitLab From 875ab4a8d9a7e559c4aaad28f5886d39923301b7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 27 Sep 2022 13:53:27 +0800 Subject: [PATCH 0273/1423] RDMA/rxe: Make sure requested access is a subset of {mr,mw}->access We should reject the requests with access flags that is not registered by MR/MW. For example, lookup_mr() should return NULL when requested access is 0x03 and mr->access is 0x01. Link: https://lore.kernel.org/r/20220927055337.22630-2-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_mw.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index d4f10c2d1aa79..014c27bba049c 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -511,7 +511,7 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || - mr_pd(mr) != pd || (access && !(access & mr->access)) || + mr_pd(mr) != pd || ((access & mr->access) != access) || mr->state != RXE_MR_STATE_VALID)) { rxe_put(mr); mr = NULL; diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 902b7df7aaedb..8df1c9066ed89 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -293,8 +293,7 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) if (unlikely((mw->rkey != rkey) || rxe_mw_pd(mw) != pd || (mw->ibmw.type == IB_MW_TYPE_2 && mw->qp != qp) || - (mw->length == 0) || - (access && !(access & mw->access)) || + (mw->length == 0) || ((access & mw->access) != access) || mw->state != RXE_MW_STATE_VALID)) { rxe_put(mw); return NULL; -- GitLab From b071850ef62e36b2fc2ec81863f07be857151409 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 27 Oct 2022 07:31:33 +0000 Subject: [PATCH 0274/1423] RDMA/rxe: Remove the duplicate assignment of mr->map_shift mr->map_shift is set to ilog2(RXE_BUF_PER_MAP) in both rxe_mr_init() and rxe_mr_alloc() so remove the duplicate one in rxe_mr_init(). Link: https://lore.kernel.org/r/1666855893-145-1-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 014c27bba049c..bc081002bddcb 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -62,7 +62,6 @@ static void rxe_mr_init(int access, struct rxe_mr *mr) mr->rkey = mr->ibmr.rkey = rkey; mr->state = RXE_MR_STATE_INVALID; - mr->map_shift = ilog2(RXE_BUF_PER_MAP); } static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) -- GitLab From 692373d186205dfb1b56f35f22702412d94d9420 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 28 Oct 2022 15:50:53 +0800 Subject: [PATCH 0275/1423] RDMA/rxe: cleanup some error handling in rxe_verbs.c Instead of 'goto and return', just return directly to simplify the error handling, and avoid some unnecessary return value check. Link: https://lore.kernel.org/r/20221028075053.3990467-1-xuhaoyue1@hisilicon.com Signed-off-by: Yunsheng Lin Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 80 ++++++++------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f2f82efbaf6d6..bcdfdadaebbc8 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -238,7 +238,6 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { - int err; int i; u32 length; struct rxe_recv_wqe *recv_wqe; @@ -246,15 +245,11 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) int full; full = queue_full(rq->queue, QUEUE_TYPE_TO_DRIVER); - if (unlikely(full)) { - err = -ENOMEM; - goto err1; - } + if (unlikely(full)) + return -ENOMEM; - if (unlikely(num_sge > rq->max_sge)) { - err = -EINVAL; - goto err1; - } + if (unlikely(num_sge > rq->max_sge)) + return -EINVAL; length = 0; for (i = 0; i < num_sge; i++) @@ -275,9 +270,6 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) queue_advance_producer(rq->queue, QUEUE_TYPE_TO_DRIVER); return 0; - -err1: - return err; } static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, @@ -343,10 +335,7 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (err) return err; - err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); - if (err) - return err; - return 0; + return rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) @@ -453,11 +442,11 @@ static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) - goto err1; + return err; err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) - goto err1; + return err; if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, @@ -465,9 +454,6 @@ static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->attr.dest_qp_num); return 0; - -err1: - return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, @@ -501,24 +487,21 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, struct rxe_sq *sq = &qp->sq; if (unlikely(num_sge > sq->max_sge)) - goto err1; + return -EINVAL; if (unlikely(mask & WR_ATOMIC_MASK)) { if (length < 8) - goto err1; + return -EINVAL; if (atomic_wr(ibwr)->remote_addr & 0x7) - goto err1; + return -EINVAL; } if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && (length > sq->max_inline))) - goto err1; + return -EINVAL; return 0; - -err1: - return -EINVAL; } static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, @@ -735,14 +718,12 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { *bad_wr = wr; - err = -EINVAL; - goto err1; + return -EINVAL; } if (unlikely(qp->srq)) { *bad_wr = wr; - err = -EINVAL; - goto err1; + return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); @@ -761,7 +742,6 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (qp->resp.state == QP_STATE_ERROR) rxe_sched_task(&qp->resp.task); -err1: return err; } @@ -826,16 +806,9 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) - goto err1; - - err = rxe_cq_resize_queue(cq, cqe, uresp, udata); - if (err) - goto err1; - - return 0; + return err; -err1: - return err; + return rxe_cq_resize_queue(cq, cqe, uresp, udata); } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) @@ -921,26 +894,22 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, struct rxe_mr *mr; mr = rxe_alloc(&rxe->mr_pool); - if (!mr) { - err = -ENOMEM; - goto err2; - } - + if (!mr) + return ERR_PTR(-ENOMEM); rxe_get(pd); mr->ibmr.pd = ibpd; err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) - goto err3; + goto err1; rxe_finalize(mr); return &mr->ibmr; -err3: +err1: rxe_cleanup(mr); -err2: return ERR_PTR(err); } @@ -956,25 +925,22 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, return ERR_PTR(-EINVAL); mr = rxe_alloc(&rxe->mr_pool); - if (!mr) { - err = -ENOMEM; - goto err1; - } + if (!mr) + return ERR_PTR(-ENOMEM); rxe_get(pd); mr->ibmr.pd = ibpd; err = rxe_mr_init_fast(max_num_sg, mr); if (err) - goto err2; + goto err1; rxe_finalize(mr); return &mr->ibmr; -err2: - rxe_cleanup(mr); err1: + rxe_cleanup(mr); return ERR_PTR(err); } -- GitLab From 1b52861f0e04da43013f88dd56464b5719a974e3 Mon Sep 17 00:00:00 2001 From: Jinyu Tang Date: Sun, 9 Oct 2022 21:45:03 +0800 Subject: [PATCH 0276/1423] riscv: support update_mmu_tlb() Add macro definition to support update_mmu_tlb() for riscv, this function is from commit:7df676974359 ("mm/memory.c:Update local TLB if PTE entry exists"). update_mmu_tlb() is used when a thread notice that other cpu thread has handled the fault and changed the PTE. For MIPS, it's worth to do that,this cpu thread will trap in tlb fault again otherwise. For RISCV, it's also better to flush local tlb than do nothing in update_mmu_tlb(). There are two kinds of page fault that have update_mmu_tlb() inside: 1.page fault which PTE is NOT none, only protection check error, like write protection fault. If updata_mmu_tlb() is empty, after finsh page fault this time and re-execute, cpu will find address but protection checked error in tlb again. So this will cause another page fault. PTE in memory is good now,so update_mmu_cache() in handle_pte_fault() will be executed. If updata_mmu_tlb() is not empty flush local tlb, cpu won't find this address in tlb next time, and get entry in physical memory, so it won't cause another page fault. 2.page fault which PTE is none or swapped. For this case, this cpu thread won't cause another page fault,cpu will have tlb miss when re-execute, and get entry in memory directly. But "set pte in phycial memory and flush local tlb" is pratice in Linux, it's better to flush local tlb if it find entry in phycial memory has changed. Maybe it's same for other ARCH which can't detect PTE changed and update it in local tlb automatically. Signed-off-by: Jinyu Tang Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20221009134503.18783-1-tjytimi@163.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7ec936910a96e..c61ae83aadee8 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -418,6 +418,9 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, local_flush_tlb_page(address); } +#define __HAVE_ARCH_UPDATE_MMU_TLB +#define update_mmu_tlb update_mmu_cache + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { -- GitLab From 3558927fc2b2fd0af309648f4071035e08719866 Mon Sep 17 00:00:00 2001 From: Cleo John Date: Mon, 10 Oct 2022 20:28:48 +0200 Subject: [PATCH 0277/1423] riscv: fix styling in ucontext header Change the two comments in ucontext.h by getting them up to the coding style proposed by torvalds. Signed-off-by: Cleo John Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20221010182848.GA28029@watet-ms7b87 Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/ucontext.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/uapi/asm/ucontext.h b/arch/riscv/include/uapi/asm/ucontext.h index 44eb993950e5e..516bd0bb0da5c 100644 --- a/arch/riscv/include/uapi/asm/ucontext.h +++ b/arch/riscv/include/uapi/asm/ucontext.h @@ -15,19 +15,23 @@ struct ucontext { struct ucontext *uc_link; stack_t uc_stack; sigset_t uc_sigmask; - /* There's some padding here to allow sigset_t to be expanded in the + /* + * There's some padding here to allow sigset_t to be expanded in the * future. Though this is unlikely, other architectures put uc_sigmask * at the end of this structure and explicitly state it can be - * expanded, so we didn't want to box ourselves in here. */ + * expanded, so we didn't want to box ourselves in here. + */ __u8 __unused[1024 / 8 - sizeof(sigset_t)]; - /* We can't put uc_sigmask at the end of this structure because we need + /* + * We can't put uc_sigmask at the end of this structure because we need * to be able to expand sigcontext in the future. For example, the * vector ISA extension will almost certainly add ISA state. We want * to ensure all user-visible ISA state can be saved and restored via a * ucontext, so we're putting this at the end in order to allow for * infinite extensibility. Since we know this will be extended and we * assume sigset_t won't be extended an extreme amount, we're - * prioritizing this. */ + * prioritizing this. + */ struct sigcontext uc_mcontext; }; -- GitLab From 03699f271de1f4df6369cd379506539cd7d590d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Sep 2022 14:33:44 -0700 Subject: [PATCH 0278/1423] string: Rewrite and add more kern-doc for the str*() functions While there were varying degrees of kern-doc for various str*()-family functions, many needed updating and clarification, or to just be entirely written. Update (and relocate) existing kern-doc and add missing functions, sadly shaking my head at how many times I have written "Do not use this function". Include the results in the core kernel API doc. Cc: Bagas Sanjaya Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Andrew Morton Cc: linux-hardening@vger.kernel.org Tested-by: Akira Yokosawa Link: https://lore.kernel.org/lkml/9b0cf584-01b3-3013-b800-1ef59fe82476@gmail.com Signed-off-by: Kees Cook --- Documentation/core-api/kernel-api.rst | 3 + include/linux/fortify-string.h | 133 ++++++++++++++++++++++++-- lib/string.c | 82 ---------------- scripts/kernel-doc | 6 +- 4 files changed, 131 insertions(+), 93 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 06f4ab1226979..0d0c4f87057cb 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -36,6 +36,9 @@ String Conversions String Manipulation ------------------- +.. kernel-doc:: include/linux/fortify-string.h + :internal: + .. kernel-doc:: lib/string.c :export: diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0f00a551939a4..e5b39b1cc2fc1 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -106,13 +106,13 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) * Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * - * +--------------------+-----------------+------------+ - * | @p needs to be: | padded to @size | not padded | - * +====================+=================+============+ - * | NUL-terminated | strscpy_pad() | strscpy() | - * +--------------------+-----------------+------------+ - * | not NUL-terminated | strtomem_pad() | strtomem() | - * +--------------------+-----------------+------------+ + * +--------------------+--------------------+------------+ + * | **p** needs to be: | padded to **size** | not padded | + * +====================+====================+============+ + * | NUL-terminated | strscpy_pad() | strscpy() | + * +--------------------+--------------------+------------+ + * | not NUL-terminated | strtomem_pad() | strtomem() | + * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with @@ -131,6 +131,21 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) return __underlying_strncpy(p, q, size); } +/** + * strcat - Append a string to an existing string + * + * @p: pointer to NUL-terminated string to append to + * @q: pointer to NUL-terminated source string to append from + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * read and write overflows, this is only possible when the + * destination buffer size is known to the compiler. Prefer + * building the string with formatting, via scnprintf() or similar. + * At the very least, use strncat(). + * + * Returns @p. + * + */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { @@ -144,6 +159,16 @@ char *strcat(char * const POS p, const char *q) } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +/** + * strnlen - Return bounded count of characters in a NUL-terminated string + * + * @p: pointer to NUL-terminated string to count. + * @maxlen: maximum number of characters to count. + * + * Returns number of characters in @p (NOT including the final NUL), or + * @maxlen, if no NUL has been found up to there. + * + */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { size_t p_size = __member_size(p); @@ -169,6 +194,19 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ +/** + * strlen - Return count of characters in a NUL-terminated string + * + * @p: pointer to NUL-terminated string to count. + * + * Do not use this function unless the string length is known at + * compile-time. When @p is unterminated, this function may crash + * or return unexpected counts that could lead to memory content + * exposures. Prefer strnlen(). + * + * Returns number of characters in @p (NOT including the final NUL). + * + */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) @@ -187,8 +225,26 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return ret; } -/* defined after fortified strlen to reuse it */ +/* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +/** + * strlcpy - Copy a string into another string buffer + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * @size: maximum number of bytes to write at @p + * + * If strlen(@q) >= @size, the copy of @q will be truncated at + * @size - 1 bytes. @p will always be NUL-terminated. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * over-reads when calculating strlen(@q), it is still possible. + * Prefer strscpy(), though note its different return values for + * detecting truncation. + * + * Returns total number of bytes written to @p, including terminating NUL. + * + */ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { size_t p_size = __member_size(p); @@ -214,8 +270,32 @@ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, si return q_len; } -/* defined after fortified strnlen to reuse it */ +/* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); +/** + * strscpy - Copy a C-string into a sized buffer + * + * @p: Where to copy the string to + * @q: Where to copy the string from + * @size: Size of destination buffer + * + * Copy the source string @p, or as much of it as fits, into the destination + * @q buffer. The behavior is undefined if the string buffers overlap. The + * destination @p buffer is always NUL terminated, unless it's zero-sized. + * + * Preferred to strlcpy() since the API doesn't require reading memory + * from the source @q string beyond the specified @size bytes, and since + * the return value is easier to error-check than strlcpy()'s. + * In addition, the implementation is robust to the string changing out + * from underneath it, unlike the current strlcpy() implementation. + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zero padded. If padding is desired please use strscpy_pad(). + * + * Returns the number of characters copied in @p (not including the + * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. + */ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) { size_t len; @@ -261,7 +341,26 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s return __real_strscpy(p, q, len); } -/* defined after fortified strlen and strnlen to reuse them */ +/** + * strncat - Append a string to an existing string + * + * @p: pointer to NUL-terminated string to append to + * @q: pointer to source string to append from + * @count: Maximum bytes to read from @q + * + * Appends at most @count bytes from @q (stopping at the first + * NUL byte) after the NUL-terminated string at @p. @p will be + * NUL-terminated. + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * read and write overflows, this is only possible when the sizes + * of @p and @q are known to the compiler. Prefer building the + * string with formatting, via scnprintf() or similar. + * + * Returns @p. + * + */ +/* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { @@ -572,6 +671,20 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp return __real_kmemdup(p, size, gfp); } +/** + * strcpy - Copy a string into another string buffer + * + * @p: pointer to destination of copy + * @q: pointer to NUL-terminated source string to copy + * + * Do not use this function. While FORTIFY_SOURCE tries to avoid + * overflows, this is only possible when the sizes of @q and @p are + * known to the compiler. Prefer strscpy(), though note its different + * return values for detecting truncation. + * + * Returns @p. + * + */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) diff --git a/lib/string.c b/lib/string.c index 3371d26a0e390..4fb566ea610ff 100644 --- a/lib/string.c +++ b/lib/string.c @@ -76,11 +76,6 @@ EXPORT_SYMBOL(strcasecmp); #endif #ifndef __HAVE_ARCH_STRCPY -/** - * strcpy - Copy a %NUL terminated string - * @dest: Where to copy the string to - * @src: Where to copy the string from - */ char *strcpy(char *dest, const char *src) { char *tmp = dest; @@ -93,19 +88,6 @@ EXPORT_SYMBOL(strcpy); #endif #ifndef __HAVE_ARCH_STRNCPY -/** - * strncpy - Copy a length-limited, C-string - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @count: The maximum number of bytes to copy - * - * The result is not %NUL-terminated if the source exceeds - * @count bytes. - * - * In the case where the length of @src is less than that of - * count, the remainder of @dest will be padded with %NUL. - * - */ char *strncpy(char *dest, const char *src, size_t count) { char *tmp = dest; @@ -122,17 +104,6 @@ EXPORT_SYMBOL(strncpy); #endif #ifndef __HAVE_ARCH_STRLCPY -/** - * strlcpy - Copy a C-string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @size: size of destination buffer - * - * Compatible with ``*BSD``: the result is always a valid - * NUL-terminated string that fits in the buffer (unless, - * of course, the buffer size is zero). It does not pad - * out the result like strncpy() does. - */ size_t strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); @@ -148,30 +119,6 @@ EXPORT_SYMBOL(strlcpy); #endif #ifndef __HAVE_ARCH_STRSCPY -/** - * strscpy - Copy a C-string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @count: Size of destination buffer - * - * Copy the string, or as much of it as fits, into the dest buffer. The - * behavior is undefined if the string buffers overlap. The destination - * buffer is always NUL terminated, unless it's zero-sized. - * - * Preferred to strlcpy() since the API doesn't require reading memory - * from the src string beyond the specified "count" bytes, and since - * the return value is easier to error-check than strlcpy()'s. - * In addition, the implementation is robust to the string changing out - * from underneath it, unlike the current strlcpy() implementation. - * - * Preferred to strncpy() since it always returns a valid string, and - * doesn't unnecessarily force the tail of the destination buffer to be - * zeroed. If zeroing is desired please use strscpy_pad(). - * - * Returns: - * * The number of characters copied (not including the trailing %NUL) - * * -E2BIG if count is 0 or @src was truncated. - */ ssize_t strscpy(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -266,11 +213,6 @@ char *stpcpy(char *__restrict__ dest, const char *__restrict__ src) EXPORT_SYMBOL(stpcpy); #ifndef __HAVE_ARCH_STRCAT -/** - * strcat - Append one %NUL-terminated string to another - * @dest: The string to be appended to - * @src: The string to append to it - */ char *strcat(char *dest, const char *src) { char *tmp = dest; @@ -285,15 +227,6 @@ EXPORT_SYMBOL(strcat); #endif #ifndef __HAVE_ARCH_STRNCAT -/** - * strncat - Append a length-limited, C-string to another - * @dest: The string to be appended to - * @src: The string to append to it - * @count: The maximum numbers of bytes to copy - * - * Note that in contrast to strncpy(), strncat() ensures the result is - * terminated. - */ char *strncat(char *dest, const char *src, size_t count) { char *tmp = dest; @@ -314,12 +247,6 @@ EXPORT_SYMBOL(strncat); #endif #ifndef __HAVE_ARCH_STRLCAT -/** - * strlcat - Append a length-limited, C-string to another - * @dest: The string to be appended to - * @src: The string to append to it - * @count: The size of the destination buffer. - */ size_t strlcat(char *dest, const char *src, size_t count) { size_t dsize = strlen(dest); @@ -484,10 +411,6 @@ EXPORT_SYMBOL(strnchr); #endif #ifndef __HAVE_ARCH_STRLEN -/** - * strlen - Find the length of a string - * @s: The string to be sized - */ size_t strlen(const char *s) { const char *sc; @@ -500,11 +423,6 @@ EXPORT_SYMBOL(strlen); #endif #ifndef __HAVE_ARCH_STRNLEN -/** - * strnlen - Find the length of a length-limited string - * @s: The string to be sized - * @count: The maximum number of bytes to search - */ size_t strnlen(const char *s, size_t count) { const char *sc; diff --git a/scripts/kernel-doc b/scripts/kernel-doc index aea04365bc69d..adbc4d3077702 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1448,6 +1448,8 @@ sub create_parameterlist($$$$) { foreach my $arg (split($splitter, $args)) { # strip comments $arg =~ s/\/\*.*\*\///; + # ignore argument attributes + $arg =~ s/\sPOS0?\s/ /; # strip leading/trailing spaces $arg =~ s/^\s*//; $arg =~ s/\s*$//; @@ -1657,6 +1659,7 @@ sub dump_function($$) { $prototype =~ s/^__inline +//; $prototype =~ s/^__always_inline +//; $prototype =~ s/^noinline +//; + $prototype =~ s/^__FORTIFY_INLINE +//; $prototype =~ s/__init +//; $prototype =~ s/__init_or_module +//; $prototype =~ s/__deprecated +//; @@ -1666,7 +1669,8 @@ sub dump_function($$) { $prototype =~ s/__weak +//; $prototype =~ s/__sched +//; $prototype =~ s/__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +//; - $prototype =~ s/__alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +//; + $prototype =~ s/__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +//; + $prototype =~ s/__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +//; my $define = $prototype =~ s/^#\s*define\s+//; #ak added $prototype =~ s/__attribute_const__ +//; $prototype =~ s/__attribute__\s*\(\( -- GitLab From 96fce387d58fa8eae6e8d9b1ecdfbc18292d7a68 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 28 Sep 2022 14:17:05 -0700 Subject: [PATCH 0279/1423] kunit/memcpy: Add dynamic size and window tests The "side effects" memmove() test accidentally found[1] a corner case in the recent refactoring of the i386 assembly memmove(), but missed another corner case. Instead of hoping to get lucky next time, implement much more complete tests of memcpy() and memmove() -- especially the moving window overlap for memmove() -- which catches all the issues encountered and should catch anything new. [1] https://lore.kernel.org/lkml/CAKwvOdkaKTa2aiA90VzFrChNQM6O_ro+b7VWs=op70jx-DKaXA@mail.gmail.com Cc: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Kees Cook --- MAINTAINERS | 1 + lib/memcpy_kunit.c | 205 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..9dd8d74c4df09 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8044,6 +8044,7 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: include/linux/fortify-string.h F: lib/fortify_kunit.c +F: lib/memcpy_kunit.c F: lib/test_fortify/* F: scripts/test_fortify.sh K: \b__NO_FORTIFY\b diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c index 2b5cc70ac53fc..c4a7107edd43a 100644 --- a/lib/memcpy_kunit.c +++ b/lib/memcpy_kunit.c @@ -270,6 +270,208 @@ static void memset_test(struct kunit *test) #undef TEST_OP } +static u8 large_src[1024]; +static u8 large_dst[2048]; +static const u8 large_zero[2048]; + +static void set_random_nonzero(struct kunit *test, u8 *byte) +{ + int failed_rng = 0; + + while (*byte == 0) { + get_random_bytes(byte, 1); + KUNIT_ASSERT_LT_MSG(test, failed_rng++, 100, + "Is the RNG broken?"); + } +} + +static void init_large(struct kunit *test) +{ + + /* Get many bit patterns. */ + get_random_bytes(large_src, ARRAY_SIZE(large_src)); + + /* Make sure we have non-zero edges. */ + set_random_nonzero(test, &large_src[0]); + set_random_nonzero(test, &large_src[ARRAY_SIZE(large_src) - 1]); + + /* Explicitly zero the entire destination. */ + memset(large_dst, 0, ARRAY_SIZE(large_dst)); +} + +/* + * Instead of an indirect function call for "copy" or a giant macro, + * use a bool to pick memcpy or memmove. + */ +static void copy_large_test(struct kunit *test, bool use_memmove) +{ + init_large(test); + + /* Copy a growing number of non-overlapping bytes ... */ + for (int bytes = 1; bytes <= ARRAY_SIZE(large_src); bytes++) { + /* Over a shifting destination window ... */ + for (int offset = 0; offset < ARRAY_SIZE(large_src); offset++) { + int right_zero_pos = offset + bytes; + int right_zero_size = ARRAY_SIZE(large_dst) - right_zero_pos; + + /* Copy! */ + if (use_memmove) + memmove(large_dst + offset, large_src, bytes); + else + memcpy(large_dst + offset, large_src, bytes); + + /* Did we touch anything before the copy area? */ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(large_dst, large_zero, offset), 0, + "with size %d at offset %d", bytes, offset); + /* Did we touch anything after the copy area? */ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(&large_dst[right_zero_pos], large_zero, right_zero_size), 0, + "with size %d at offset %d", bytes, offset); + + /* Are we byte-for-byte exact across the copy? */ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(large_dst + offset, large_src, bytes), 0, + "with size %d at offset %d", bytes, offset); + + /* Zero out what we copied for the next cycle. */ + memset(large_dst + offset, 0, bytes); + } + /* Avoid stall warnings if this loop gets slow. */ + cond_resched(); + } +} + +static void memcpy_large_test(struct kunit *test) +{ + copy_large_test(test, false); +} + +static void memmove_large_test(struct kunit *test) +{ + copy_large_test(test, true); +} + +/* + * On the assumption that boundary conditions are going to be the most + * sensitive, instead of taking a full step (inc) each iteration, + * take single index steps for at least the first "inc"-many indexes + * from the "start" and at least the last "inc"-many indexes before + * the "end". When in the middle, take full "inc"-wide steps. For + * example, calling next_step(idx, 1, 15, 3) with idx starting at 0 + * would see the following pattern: 1 2 3 4 7 10 11 12 13 14 15. + */ +static int next_step(int idx, int start, int end, int inc) +{ + start += inc; + end -= inc; + + if (idx < start || idx + inc > end) + inc = 1; + return idx + inc; +} + +static void inner_loop(struct kunit *test, int bytes, int d_off, int s_off) +{ + int left_zero_pos, left_zero_size; + int right_zero_pos, right_zero_size; + int src_pos, src_orig_pos, src_size; + int pos; + + /* Place the source in the destination buffer. */ + memcpy(&large_dst[s_off], large_src, bytes); + + /* Copy to destination offset. */ + memmove(&large_dst[d_off], &large_dst[s_off], bytes); + + /* Make sure destination entirely matches. */ + KUNIT_ASSERT_EQ_MSG(test, memcmp(&large_dst[d_off], large_src, bytes), 0, + "with size %d at src offset %d and dest offset %d", + bytes, s_off, d_off); + + /* Calculate the expected zero spans. */ + if (s_off < d_off) { + left_zero_pos = 0; + left_zero_size = s_off; + + right_zero_pos = d_off + bytes; + right_zero_size = ARRAY_SIZE(large_dst) - right_zero_pos; + + src_pos = s_off; + src_orig_pos = 0; + src_size = d_off - s_off; + } else { + left_zero_pos = 0; + left_zero_size = d_off; + + right_zero_pos = s_off + bytes; + right_zero_size = ARRAY_SIZE(large_dst) - right_zero_pos; + + src_pos = d_off + bytes; + src_orig_pos = src_pos - s_off; + src_size = right_zero_pos - src_pos; + } + + /* Check non-overlapping source is unchanged.*/ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(&large_dst[src_pos], &large_src[src_orig_pos], src_size), 0, + "with size %d at src offset %d and dest offset %d", + bytes, s_off, d_off); + + /* Check leading buffer contents are zero. */ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(&large_dst[left_zero_pos], large_zero, left_zero_size), 0, + "with size %d at src offset %d and dest offset %d", + bytes, s_off, d_off); + /* Check trailing buffer contents are zero. */ + KUNIT_ASSERT_EQ_MSG(test, + memcmp(&large_dst[right_zero_pos], large_zero, right_zero_size), 0, + "with size %d at src offset %d and dest offset %d", + bytes, s_off, d_off); + + /* Zero out everything not already zeroed.*/ + pos = left_zero_pos + left_zero_size; + memset(&large_dst[pos], 0, right_zero_pos - pos); +} + +static void memmove_overlap_test(struct kunit *test) +{ + /* + * Running all possible offset and overlap combinations takes a + * very long time. Instead, only check up to 128 bytes offset + * into the destination buffer (which should result in crossing + * cachelines), with a step size of 1 through 7 to try to skip some + * redundancy. + */ + static const int offset_max = 128; /* less than ARRAY_SIZE(large_src); */ + static const int bytes_step = 7; + static const int window_step = 7; + + static const int bytes_start = 1; + static const int bytes_end = ARRAY_SIZE(large_src) + 1; + + init_large(test); + + /* Copy a growing number of overlapping bytes ... */ + for (int bytes = bytes_start; bytes < bytes_end; + bytes = next_step(bytes, bytes_start, bytes_end, bytes_step)) { + + /* Over a shifting destination window ... */ + for (int d_off = 0; d_off < offset_max; d_off++) { + int s_start = max(d_off - bytes, 0); + int s_end = min_t(int, d_off + bytes, ARRAY_SIZE(large_src)); + + /* Over a shifting source window ... */ + for (int s_off = s_start; s_off < s_end; + s_off = next_step(s_off, s_start, s_end, window_step)) + inner_loop(test, bytes, d_off, s_off); + + /* Avoid stall warnings. */ + cond_resched(); + } + } +} + static void strtomem_test(struct kunit *test) { static const char input[sizeof(unsigned long)] = "hi"; @@ -325,7 +527,10 @@ static void strtomem_test(struct kunit *test) static struct kunit_case memcpy_test_cases[] = { KUNIT_CASE(memset_test), KUNIT_CASE(memcpy_test), + KUNIT_CASE(memcpy_large_test), KUNIT_CASE(memmove_test), + KUNIT_CASE(memmove_large_test), + KUNIT_CASE(memmove_overlap_test), KUNIT_CASE(strtomem_test), {} }; -- GitLab From 310f541a027b1d5dc68f44f176cde618e6ee9691 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 12 Oct 2022 20:00:37 +0800 Subject: [PATCH 0280/1423] riscv: Enable HAVE_ARCH_HUGE_VMAP for 64BIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This sets the HAVE_ARCH_HUGE_VMAP option, and defines the required page table functions. With this feature, ioremap area will be mapped with huge page granularity according to its actual size. This feature can be disabled by kernel parameter "nohugeiomap". Signed-off-by: Liu Shixin Reviewed-by: Björn Töpel Tested-by: Björn Töpel Link: https://lore.kernel.org/r/20221012120038.1034354-2-liushixin2@huawei.com [Palmer: minor formatting] Signed-off-by: Palmer Dabbelt --- .../features/vm/huge-vmap/arch-support.txt | 2 +- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/vmalloc.h | 18 ++++ arch/riscv/mm/Makefile | 1 + arch/riscv/mm/pgtable.c | 83 +++++++++++++++++++ 5 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/mm/pgtable.c diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index 13b4940e0c3a7..7274a4b15bcc8 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -21,7 +21,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | TODO | | sh: | TODO | | sparc: | TODO | diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6b48a3ae98439..db2082dd456d6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -72,6 +72,7 @@ config RISCV select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT && !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && 64BIT diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h index ff9abc00d1394..48da5371f1e9a 100644 --- a/arch/riscv/include/asm/vmalloc.h +++ b/arch/riscv/include/asm/vmalloc.h @@ -1,4 +1,22 @@ #ifndef _ASM_RISCV_VMALLOC_H #define _ASM_RISCV_VMALLOC_H +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP + +#define IOREMAP_MAX_ORDER (PUD_SHIFT) + +#define arch_vmap_pud_supported arch_vmap_pud_supported +static inline bool arch_vmap_pud_supported(pgprot_t prot) +{ + return true; +} + +#define arch_vmap_pmd_supported arch_vmap_pmd_supported +static inline bool arch_vmap_pmd_supported(pgprot_t prot) +{ + return true; +} + +#endif + #endif /* _ASM_RISCV_VMALLOC_H */ diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index d76aabf4b94d6..ce7f121ad2dc5 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -13,6 +13,7 @@ obj-y += extable.o obj-$(CONFIG_MMU) += fault.o pageattr.o obj-y += cacheflush.o obj-y += context.o +obj-y += pgtable.o ifeq ($(CONFIG_MMU),y) obj-$(CONFIG_SMP) += tlbflush.o diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c new file mode 100644 index 0000000000000..6645ead1a7c16 --- /dev/null +++ b/arch/riscv/mm/pgtable.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +void p4d_clear_huge(p4d_t *p4d) +{ +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + pud_t new_pud = pfn_pud(__phys_to_pfn(phys), prot); + + set_pud(pud, new_pud); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_leaf(READ_ONCE(*pud))) + return 0; + pud_clear(pud); + return 1; +} + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd = pud_pgtable(*pud); + int i; + + pud_clear(pud); + + flush_tlb_kernel_range(addr, addr + PUD_SIZE); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd[i])) { + pte_t *pte = (pte_t *)pmd_page_vaddr(pmd[i]); + + pte_free_kernel(NULL, pte); + } + } + + pmd_free(NULL, pmd); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), prot); + + set_pmd(pmd, new_pmd); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_leaf(READ_ONCE(*pmd))) + return 0; + pmd_clear(pmd); + return 1; +} + +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + + pmd_clear(pmd); + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + pte_free_kernel(NULL, pte); + return 1; +} + +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -- GitLab From be79afc740b5a1b2048cd67580cdb9d76d7e6cc2 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 12 Oct 2022 20:00:38 +0800 Subject: [PATCH 0281/1423] riscv: Enable HAVE_ARCH_HUGE_VMALLOC for 64BIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After we support HAVE_ARCH_HUGE_VMAP, we can now enable HAVE_ARCH_HUGE_VMALLOC too. This feature has been used in kvmalloc and alloc_large_system_hash for now. This feature can be disabled by kernel parameters "nohugevmalloc". Signed-off-by: Liu Shixin Reviewed-by: Björn Töpel Tested-by: Björn Töpel Link: https://lore.kernel.org/r/20221012120038.1034354-3-liushixin2@huawei.com [Palmer: minor formatting] Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index db2082dd456d6..7cd981f96f48b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -72,6 +72,7 @@ config RISCV select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT && !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL -- GitLab From 085bdaa6eb1476ec054164bdc4001bc3916ff5cb Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 11 Oct 2022 14:21:20 +0800 Subject: [PATCH 0282/1423] memblock test: Add test to memblock_add() 129th region Add 129th region into the memblock, and this will trigger the memblock_double_array() function, this needs valid memory regions. So using dummy_physical_memory_init() to allocate a large enough memory region, and split it into a large enough memory which can be choosed by memblock_double_array(), and the left memory will be split into small memory region, and add them into the memblock. It make sure the memblock_double_array() will always choose the valid memory region that is allocated by the dummy_physical_memory_init(). So memblock_double_array() must success. Another thing should be done is to restore the memory.regions after memblock_double_array(), due to now the memory.regions is pointing to a memory region allocated by dummy_physical_memory_init(). And it will affect the subsequent tests if we don't restore the memory region. So simply record the origin region, and restore it after the test. Signed-off-by: Shaoqin Huang Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20221011062128.49359-2-shaoqin.huang@intel.com --- tools/testing/memblock/tests/basic_api.c | 93 ++++++++++++++++++++++++ tools/testing/memblock/tests/common.c | 7 +- tools/testing/memblock/tests/common.h | 6 +- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index a13a57ba0815f..4d61a4b474be3 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -423,6 +423,98 @@ static int memblock_add_near_max_check(void) return 0; } +/* + * A test that trying to add the 129th memory block. + * Expect to trigger memblock_double_array() to double the + * memblock.memory.max, find a new valid memory as + * memory.regions. + */ +static int memblock_add_many_check(void) +{ + int i; + void *orig_region; + struct region r = { + .base = SZ_16K, + .size = SZ_16K, + }; + phys_addr_t new_memory_regions_size; + phys_addr_t base, size = SZ_64; + phys_addr_t gap_size = SZ_64; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_allow_resize(); + + dummy_physical_memory_init(); + /* + * We allocated enough memory by using dummy_physical_memory_init(), and + * split it into small block. First we split a large enough memory block + * as the memory region which will be choosed by memblock_double_array(). + */ + base = PAGE_ALIGN(dummy_physical_memory_base()); + new_memory_regions_size = PAGE_ALIGN(INIT_MEMBLOCK_REGIONS * 2 * + sizeof(struct memblock_region)); + memblock_add(base, new_memory_regions_size); + + /* This is the base of small memory block. */ + base += new_memory_regions_size + gap_size; + + orig_region = memblock.memory.regions; + + for (i = 0; i < INIT_MEMBLOCK_REGIONS; i++) { + /* + * Add these small block to fulfill the memblock. We keep a + * gap between the nearby memory to avoid being merged. + */ + memblock_add(base, size); + base += size + gap_size; + + ASSERT_EQ(memblock.memory.cnt, i + 2); + ASSERT_EQ(memblock.memory.total_size, new_memory_regions_size + + (i + 1) * size); + } + + /* + * At there, memblock_double_array() has been succeed, check if it + * update the memory.max. + */ + ASSERT_EQ(memblock.memory.max, INIT_MEMBLOCK_REGIONS * 2); + + /* memblock_double_array() will reserve the memory it used. Check it. */ + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, new_memory_regions_size); + + /* + * Now memblock_double_array() works fine. Let's check after the + * double_array(), the memblock_add() still works as normal. + */ + memblock_add(r.base, r.size); + ASSERT_EQ(memblock.memory.regions[0].base, r.base); + ASSERT_EQ(memblock.memory.regions[0].size, r.size); + + ASSERT_EQ(memblock.memory.cnt, INIT_MEMBLOCK_REGIONS + 2); + ASSERT_EQ(memblock.memory.total_size, INIT_MEMBLOCK_REGIONS * size + + new_memory_regions_size + + r.size); + ASSERT_EQ(memblock.memory.max, INIT_MEMBLOCK_REGIONS * 2); + + dummy_physical_memory_cleanup(); + + /* + * The current memory.regions is occupying a range of memory that + * allocated from dummy_physical_memory_init(). After free the memory, + * we must not use it. So restore the origin memory region to make sure + * the tests can run as normal and not affected by the double array. + */ + memblock.memory.regions = orig_region; + memblock.memory.cnt = INIT_MEMBLOCK_REGIONS; + + test_pass_pop(); + + return 0; +} + static int memblock_add_checks(void) { prefix_reset(); @@ -438,6 +530,7 @@ static int memblock_add_checks(void) memblock_add_twice_check(); memblock_add_between_check(); memblock_add_near_max_check(); + memblock_add_many_check(); prefix_pop(); diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index 3f795047bbe18..f43b6f414983d 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -5,8 +5,6 @@ #include #include -#define INIT_MEMBLOCK_REGIONS 128 -#define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #define PREFIXES_MAX 15 #define DELIM ": " #define BASIS 10000 @@ -115,6 +113,11 @@ void dummy_physical_memory_cleanup(void) free(memory_block.base); } +phys_addr_t dummy_physical_memory_base(void) +{ + return (phys_addr_t)memory_block.base; +} + static void usage(const char *prog) { BUILD_BUG_ON(ARRAY_SIZE(help_opts) != ARRAY_SIZE(long_opts) - 1); diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index d6bbbe63bfc36..cc82b85151b61 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -10,9 +10,12 @@ #include #include <../selftests/kselftest.h> -#define MEM_SIZE SZ_16K +#define MEM_SIZE SZ_32K #define NUMA_NODES 8 +#define INIT_MEMBLOCK_REGIONS 128 +#define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS + enum test_flags { /* No special request. */ TEST_F_NONE = 0x0, @@ -124,6 +127,7 @@ void setup_memblock(void); void setup_numa_memblock(const unsigned int node_fracs[]); void dummy_physical_memory_init(void); void dummy_physical_memory_cleanup(void); +phys_addr_t dummy_physical_memory_base(void); void parse_args(int argc, char **argv); void test_fail(void); -- GitLab From 5b27dd7968b9c916da4af48d0310f94152744f8e Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 11 Oct 2022 14:21:21 +0800 Subject: [PATCH 0283/1423] memblock test: Add test to memblock_reserve() 129th region Reserve 129th region in the memblock, and this will trigger the memblock_double_array() function, this needs valid memory regions. So using dummy_physical_memory_init() to allocate a valid memory region. At the same time, reserve 128 faked memory region, and make sure these reserved region not intersect with the valid memory region. So memblock_double_array() will choose the valid memory region, and it will success. Also need to restore the reserved.regions after memblock_double_array(), to make sure the subsequent tests can run as normal. Signed-off-by: Shaoqin Huang Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20221011062128.49359-3-shaoqin.huang@intel.com --- tools/testing/memblock/tests/basic_api.c | 91 ++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 4d61a4b474be3..411647094cc37 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -892,6 +892,96 @@ static int memblock_reserve_near_max_check(void) return 0; } +/* + * A test that trying to reserve the 129th memory block. + * Expect to trigger memblock_double_array() to double the + * memblock.memory.max, find a new valid memory as + * reserved.regions. + */ +static int memblock_reserve_many_check(void) +{ + int i; + void *orig_region; + struct region r = { + .base = SZ_16K, + .size = SZ_16K, + }; + phys_addr_t memory_base = SZ_128K; + phys_addr_t new_reserved_regions_size; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_allow_resize(); + + /* Add a valid memory region used by double_array(). */ + dummy_physical_memory_init(); + memblock_add(dummy_physical_memory_base(), MEM_SIZE); + + for (i = 0; i < INIT_MEMBLOCK_REGIONS; i++) { + /* Reserve some fakes memory region to fulfill the memblock. */ + memblock_reserve(memory_base, MEM_SIZE); + + ASSERT_EQ(memblock.reserved.cnt, i + 1); + ASSERT_EQ(memblock.reserved.total_size, (i + 1) * MEM_SIZE); + + /* Keep the gap so these memory region will not be merged. */ + memory_base += MEM_SIZE * 2; + } + + orig_region = memblock.reserved.regions; + + /* This reserve the 129 memory_region, and makes it double array. */ + memblock_reserve(memory_base, MEM_SIZE); + + /* + * This is the memory region size used by the doubled reserved.regions, + * and it has been reserved due to it has been used. The size is used to + * calculate the total_size that the memblock.reserved have now. + */ + new_reserved_regions_size = PAGE_ALIGN((INIT_MEMBLOCK_REGIONS * 2) * + sizeof(struct memblock_region)); + /* + * The double_array() will find a free memory region as the new + * reserved.regions, and the used memory region will be reserved, so + * there will be one more region exist in the reserved memblock. And the + * one more reserved region's size is new_reserved_regions_size. + */ + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 2); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + /* + * Now memblock_double_array() works fine. Let's check after the + * double_array(), the memblock_reserve() still works as normal. + */ + memblock_reserve(r.base, r.size); + ASSERT_EQ(memblock.reserved.regions[0].base, r.base); + ASSERT_EQ(memblock.reserved.regions[0].size, r.size); + + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 3); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size + + r.size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + dummy_physical_memory_cleanup(); + + /* + * The current reserved.regions is occupying a range of memory that + * allocated from dummy_physical_memory_init(). After free the memory, + * we must not use it. So restore the origin memory region to make sure + * the tests can run as normal and not affected by the double array. + */ + memblock.reserved.regions = orig_region; + memblock.reserved.cnt = INIT_MEMBLOCK_RESERVED_REGIONS; + + test_pass_pop(); + + return 0; +} + static int memblock_reserve_checks(void) { prefix_reset(); @@ -906,6 +996,7 @@ static int memblock_reserve_checks(void) memblock_reserve_twice_check(); memblock_reserve_between_check(); memblock_reserve_near_max_check(); + memblock_reserve_many_check(); prefix_pop(); -- GitLab From 62a56c540797681a5b50a4c06bf638f79b6013bc Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 11 Oct 2022 14:21:22 +0800 Subject: [PATCH 0284/1423] memblock test: Update TODO list Remove the completed items from TODO list. Signed-off-by: Shaoqin Huang Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20221011062128.49359-4-shaoqin.huang@intel.com --- tools/testing/memblock/TODO | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index 33044c634ea7f..503cc96fcdc3a 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -1,17 +1,10 @@ TODO ===== -1. Add tests trying to memblock_add() or memblock_reserve() 129th region. - This will trigger memblock_double_array(), make sure it succeeds. - *Important:* These tests require valid memory ranges, use dummy physical - memory block from common.c to implement them. It is also very - likely that the current MEM_SIZE won't be enough for these - test cases. Use realloc to adjust the size accordingly. - -2. Add test cases using this functions (implement them for both directions): +1. Add test cases using this functions (implement them for both directions): + memblock_alloc_raw() + memblock_alloc_exact_nid_raw() + memblock_alloc_try_nid_raw() -3. Add tests for memblock_alloc_node() to check if the correct NUMA node is set +2. Add tests for memblock_alloc_node() to check if the correct NUMA node is set for the new region -- GitLab From e9e6fa49dbab6d84c676666f3fe7d360497fd65b Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 28 Oct 2022 20:33:20 +0800 Subject: [PATCH 0285/1423] apparmor: Fix memleak in alloc_ns() After changes in commit a1bd627b46d1 ("apparmor: share profile name on replacement"), the hname member of struct aa_policy is not valid slab object, but a subset of that, it can not be freed by kfree_sensitive(), use aa_policy_destroy() to fix it. Fixes: a1bd627b46d1 ("apparmor: share profile name on replacement") Signed-off-by: Xiu Jianfeng Signed-off-by: John Johansen --- security/apparmor/policy_ns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 5c38563a6dcfd..fd5b7afbcb487 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -132,7 +132,7 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name) return ns; fail_unconfined: - kfree_sensitive(ns->base.hname); + aa_policy_destroy(&ns->base); fail_ns: kfree_sensitive(ns); return NULL; -- GitLab From 969864efae78eb51b0baa1d14b2dfe08151b5874 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Tue, 25 Oct 2022 23:41:24 +0530 Subject: [PATCH 0286/1423] i2c: amd-mp2: use msix/msi if the hardware supports Use msix or msi interrupts if the hardware supports it. Else, fallback to legacy interrupts. Co-developed-by: Basavaraj Natikar Signed-off-by: Basavaraj Natikar Signed-off-by: Raju Rangoju Acked-by: Shyam Sundar S K Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-amd-mp2-pci.c | 30 +++++++++++++++++++--------- drivers/i2c/busses/i2c-amd-mp2.h | 1 + 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/i2c/busses/i2c-amd-mp2-pci.c b/drivers/i2c/busses/i2c-amd-mp2-pci.c index f57077a7448d1..1431653009496 100644 --- a/drivers/i2c/busses/i2c-amd-mp2-pci.c +++ b/drivers/i2c/busses/i2c-amd-mp2-pci.c @@ -288,7 +288,7 @@ static void amd_mp2_clear_reg(struct amd_mp2_dev *privdata) static int amd_mp2_pci_init(struct amd_mp2_dev *privdata, struct pci_dev *pci_dev) { - int rc; + int irq_flag = 0, rc; pci_set_drvdata(pci_dev, privdata); @@ -311,17 +311,29 @@ static int amd_mp2_pci_init(struct amd_mp2_dev *privdata, if (rc) goto err_dma_mask; - /* Set up intx irq */ + /* request and enable interrupt */ writel(0, privdata->mmio + AMD_P2C_MSG_INTEN); - pci_intx(pci_dev, 1); - rc = devm_request_irq(&pci_dev->dev, pci_dev->irq, amd_mp2_irq_isr, - IRQF_SHARED, dev_name(&pci_dev->dev), privdata); - if (rc) - pci_err(pci_dev, "Failure requesting irq %i: %d\n", - pci_dev->irq, rc); + rc = pci_alloc_irq_vectors(pci_dev, 1, 1, PCI_IRQ_ALL_TYPES); + if (rc < 0) { + dev_err(&pci_dev->dev, "Failed to allocate single IRQ err=%d\n", rc); + goto err_dma_mask; + } + + privdata->dev_irq = pci_irq_vector(pci_dev, 0); + if (!pci_dev->msix_enabled && !pci_dev->msi_enabled) + irq_flag = IRQF_SHARED; + + rc = devm_request_irq(&pci_dev->dev, privdata->dev_irq, + amd_mp2_irq_isr, irq_flag, dev_name(&pci_dev->dev), privdata); + if (rc) { + pci_err(pci_dev, "Failure requesting irq %i: %d\n", privdata->dev_irq, rc); + goto free_irq_vectors; + } return rc; +free_irq_vectors: + free_irq(privdata->dev_irq, privdata); err_dma_mask: pci_clear_master(pci_dev); err_pci_enable: @@ -364,7 +376,7 @@ static void amd_mp2_pci_remove(struct pci_dev *pci_dev) pm_runtime_forbid(&pci_dev->dev); pm_runtime_get_noresume(&pci_dev->dev); - pci_intx(pci_dev, 0); + free_irq(privdata->dev_irq, privdata); pci_clear_master(pci_dev); amd_mp2_clear_reg(privdata); diff --git a/drivers/i2c/busses/i2c-amd-mp2.h b/drivers/i2c/busses/i2c-amd-mp2.h index ddecd0c886560..018a42de8b1e3 100644 --- a/drivers/i2c/busses/i2c-amd-mp2.h +++ b/drivers/i2c/busses/i2c-amd-mp2.h @@ -183,6 +183,7 @@ struct amd_mp2_dev { struct mutex c2p_lock; u8 c2p_lock_busid; unsigned int probed; + int dev_irq; }; /* PCIe communication driver */ -- GitLab From bb2617f0f2abbd8c622b2401e5e4984a4eef895b Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 9 Oct 2022 12:16:30 +0200 Subject: [PATCH 0287/1423] dt-bindings: i2c: update bindings for mt7986 soc Add i2c compatible for MT7986 SOC. Signed-off-by: Frank Wunderlich Reviewed-by: AngeloGioacchino Del Regno Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml index 4e730fb7be567..421563bf576cd 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml @@ -23,6 +23,7 @@ properties: - const: mediatek,mt6577-i2c - const: mediatek,mt6589-i2c - const: mediatek,mt7622-i2c + - const: mediatek,mt7986-i2c - const: mediatek,mt8168-i2c - const: mediatek,mt8173-i2c - const: mediatek,mt8183-i2c -- GitLab From e0b7afc0eba88e8270860a573fe4ea28fe2d467c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 9 Oct 2022 12:16:31 +0200 Subject: [PATCH 0288/1423] i2c: mediatek: add mt7986 support Add i2c support for MT7986 SoC. Signed-off-by: Frank Wunderlich Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mt65xx.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index fc7bfd98156ba..d80e59340d97a 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -431,6 +431,19 @@ static const struct mtk_i2c_compatible mt8168_compat = { .max_dma_support = 33, }; +static const struct mtk_i2c_compatible mt7986_compat = { + .quirks = &mt7622_i2c_quirks, + .regs = mt_i2c_regs_v1, + .pmic_i2c = 0, + .dcm = 1, + .auto_restart = 1, + .aux_len_reg = 1, + .timing_adjust = 0, + .dma_sync = 1, + .ltiming_adjust = 0, + .max_dma_support = 32, +}; + static const struct mtk_i2c_compatible mt8173_compat = { .regs = mt_i2c_regs_v1, .pmic_i2c = 0, @@ -503,6 +516,7 @@ static const struct of_device_id mtk_i2c_of_match[] = { { .compatible = "mediatek,mt6577-i2c", .data = &mt6577_compat }, { .compatible = "mediatek,mt6589-i2c", .data = &mt6589_compat }, { .compatible = "mediatek,mt7622-i2c", .data = &mt7622_compat }, + { .compatible = "mediatek,mt7986-i2c", .data = &mt7986_compat }, { .compatible = "mediatek,mt8168-i2c", .data = &mt8168_compat }, { .compatible = "mediatek,mt8173-i2c", .data = &mt8173_compat }, { .compatible = "mediatek,mt8183-i2c", .data = &mt8183_compat }, -- GitLab From a826b6e9e467ed378a2c50c4a03cb863ab681198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 8 Oct 2022 14:59:23 +0200 Subject: [PATCH 0289/1423] i2c: npcm7xx: Group bank 0/1 registers together for readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unlabelled registers NPCM_I2CCTL4 to NPCM_I2CSCLHT overlap with the bank 1 registers below, and they are accessed after selecting bank 0, so they clearly belong to bank 0. Move them together with the other bank 0 registers, and move the unrelated definition of npcm_i2caddr down to keep the banked registers in one piece. Signed-off-by: Jonathan Neuschäfer Reviewed-by: Tali Perry Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-npcm7xx.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 0c365b57d9572..9a7a2d0bf5765 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -106,7 +106,7 @@ enum i2c_addr { #define NPCM_I2CCST3 0x19 #define I2C_VER 0x1F -/*BANK0 regs*/ +/* BANK 0 regs */ #define NPCM_I2CADDR3 0x10 #define NPCM_I2CADDR7 0x11 #define NPCM_I2CADDR4 0x12 @@ -115,6 +115,20 @@ enum i2c_addr { #define NPCM_I2CADDR9 0x15 #define NPCM_I2CADDR6 0x16 #define NPCM_I2CADDR10 0x17 +#define NPCM_I2CCTL4 0x1A +#define NPCM_I2CCTL5 0x1B +#define NPCM_I2CSCLLT 0x1C /* SCL Low Time */ +#define NPCM_I2CFIF_CTL 0x1D /* FIFO Control */ +#define NPCM_I2CSCLHT 0x1E /* SCL High Time */ + +/* BANK 1 regs */ +#define NPCM_I2CFIF_CTS 0x10 /* Both FIFOs Control and Status */ +#define NPCM_I2CTXF_CTL 0x12 /* Tx-FIFO Control */ +#define NPCM_I2CT_OUT 0x14 /* Bus T.O. */ +#define NPCM_I2CPEC 0x16 /* PEC Data */ +#define NPCM_I2CTXF_STS 0x1A /* Tx-FIFO Status */ +#define NPCM_I2CRXF_STS 0x1C /* Rx-FIFO Status */ +#define NPCM_I2CRXF_CTL 0x1E /* Rx-FIFO Control */ #if IS_ENABLED(CONFIG_I2C_SLAVE) /* @@ -131,21 +145,6 @@ static const int npcm_i2caddr[I2C_NUM_OWN_ADDR] = { }; #endif -#define NPCM_I2CCTL4 0x1A -#define NPCM_I2CCTL5 0x1B -#define NPCM_I2CSCLLT 0x1C /* SCL Low Time */ -#define NPCM_I2CFIF_CTL 0x1D /* FIFO Control */ -#define NPCM_I2CSCLHT 0x1E /* SCL High Time */ - -/* BANK 1 regs */ -#define NPCM_I2CFIF_CTS 0x10 /* Both FIFOs Control and Status */ -#define NPCM_I2CTXF_CTL 0x12 /* Tx-FIFO Control */ -#define NPCM_I2CT_OUT 0x14 /* Bus T.O. */ -#define NPCM_I2CPEC 0x16 /* PEC Data */ -#define NPCM_I2CTXF_STS 0x1A /* Tx-FIFO Status */ -#define NPCM_I2CRXF_STS 0x1C /* Rx-FIFO Status */ -#define NPCM_I2CRXF_CTL 0x1E /* Rx-FIFO Control */ - /* NPCM_I2CST reg fields */ #define NPCM_I2CST_XMIT BIT(0) #define NPCM_I2CST_MASTER BIT(1) -- GitLab From 3ca8217dc450f7a50f90d2ad97787e38088af8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 8 Oct 2022 14:59:24 +0200 Subject: [PATCH 0290/1423] i2c: npcm7xx: Annotate register field definitions with longer names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the code easier to understand, add longer names to the definitions of register fields. These longer names are based on source code published by DELL/AESS for WPCM450, but should apply just as well to NPCM7xx and NPCM8xx. Signed-off-by: Jonathan Neuschäfer Reviewed-by: Tali Perry Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-npcm7xx.c | 56 ++++++++++++++++---------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 9a7a2d0bf5765..bbc7359e67f74 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -146,50 +146,50 @@ static const int npcm_i2caddr[I2C_NUM_OWN_ADDR] = { #endif /* NPCM_I2CST reg fields */ -#define NPCM_I2CST_XMIT BIT(0) -#define NPCM_I2CST_MASTER BIT(1) -#define NPCM_I2CST_NMATCH BIT(2) -#define NPCM_I2CST_STASTR BIT(3) -#define NPCM_I2CST_NEGACK BIT(4) -#define NPCM_I2CST_BER BIT(5) -#define NPCM_I2CST_SDAST BIT(6) -#define NPCM_I2CST_SLVSTP BIT(7) +#define NPCM_I2CST_XMIT BIT(0) /* Transmit mode */ +#define NPCM_I2CST_MASTER BIT(1) /* Master mode */ +#define NPCM_I2CST_NMATCH BIT(2) /* New match */ +#define NPCM_I2CST_STASTR BIT(3) /* Stall after start */ +#define NPCM_I2CST_NEGACK BIT(4) /* Negative ACK */ +#define NPCM_I2CST_BER BIT(5) /* Bus error */ +#define NPCM_I2CST_SDAST BIT(6) /* SDA status */ +#define NPCM_I2CST_SLVSTP BIT(7) /* Slave stop */ /* NPCM_I2CCST reg fields */ -#define NPCM_I2CCST_BUSY BIT(0) -#define NPCM_I2CCST_BB BIT(1) -#define NPCM_I2CCST_MATCH BIT(2) -#define NPCM_I2CCST_GCMATCH BIT(3) -#define NPCM_I2CCST_TSDA BIT(4) -#define NPCM_I2CCST_TGSCL BIT(5) -#define NPCM_I2CCST_MATCHAF BIT(6) -#define NPCM_I2CCST_ARPMATCH BIT(7) +#define NPCM_I2CCST_BUSY BIT(0) /* Busy */ +#define NPCM_I2CCST_BB BIT(1) /* Bus busy */ +#define NPCM_I2CCST_MATCH BIT(2) /* Address match */ +#define NPCM_I2CCST_GCMATCH BIT(3) /* Global call match */ +#define NPCM_I2CCST_TSDA BIT(4) /* Test SDA line */ +#define NPCM_I2CCST_TGSCL BIT(5) /* Toggle SCL line */ +#define NPCM_I2CCST_MATCHAF BIT(6) /* Match address field */ +#define NPCM_I2CCST_ARPMATCH BIT(7) /* ARP address match */ /* NPCM_I2CCTL1 reg fields */ -#define NPCM_I2CCTL1_START BIT(0) -#define NPCM_I2CCTL1_STOP BIT(1) -#define NPCM_I2CCTL1_INTEN BIT(2) +#define NPCM_I2CCTL1_START BIT(0) /* Generate start condition */ +#define NPCM_I2CCTL1_STOP BIT(1) /* Generate stop condition */ +#define NPCM_I2CCTL1_INTEN BIT(2) /* Interrupt enable */ #define NPCM_I2CCTL1_EOBINTE BIT(3) #define NPCM_I2CCTL1_ACK BIT(4) -#define NPCM_I2CCTL1_GCMEN BIT(5) -#define NPCM_I2CCTL1_NMINTE BIT(6) -#define NPCM_I2CCTL1_STASTRE BIT(7) +#define NPCM_I2CCTL1_GCMEN BIT(5) /* Global call match enable */ +#define NPCM_I2CCTL1_NMINTE BIT(6) /* New match interrupt enable */ +#define NPCM_I2CCTL1_STASTRE BIT(7) /* Stall after start enable */ /* RW1S fields (inside a RW reg): */ #define NPCM_I2CCTL1_RWS \ (NPCM_I2CCTL1_START | NPCM_I2CCTL1_STOP | NPCM_I2CCTL1_ACK) /* npcm_i2caddr reg fields */ -#define NPCM_I2CADDR_A GENMASK(6, 0) -#define NPCM_I2CADDR_SAEN BIT(7) +#define NPCM_I2CADDR_A GENMASK(6, 0) /* Address */ +#define NPCM_I2CADDR_SAEN BIT(7) /* Slave address enable */ /* NPCM_I2CCTL2 reg fields */ -#define I2CCTL2_ENABLE BIT(0) -#define I2CCTL2_SCLFRQ6_0 GENMASK(7, 1) +#define I2CCTL2_ENABLE BIT(0) /* Module enable */ +#define I2CCTL2_SCLFRQ6_0 GENMASK(7, 1) /* Bits 0:6 of frequency divisor */ /* NPCM_I2CCTL3 reg fields */ -#define I2CCTL3_SCLFRQ8_7 GENMASK(1, 0) -#define I2CCTL3_ARPMEN BIT(2) +#define I2CCTL3_SCLFRQ8_7 GENMASK(1, 0) /* Bits 7:8 of frequency divisor */ +#define I2CCTL3_ARPMEN BIT(2) /* ARP match enable */ #define I2CCTL3_IDL_START BIT(3) #define I2CCTL3_400K_MODE BIT(4) #define I2CCTL3_BNK_SEL BIT(5) -- GitLab From b1f37ef655cf372f96015bf54abdb76a91aff27e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Oct 2022 11:10:56 +0100 Subject: [PATCH 0291/1423] x86: Unconfuse CONFIG_ and X86_FEATURE_ namespaces Lukas reported someone fat fingered the CONFIG_ symbol; fix er up. Fixes: 5d8213864ade ("x86/retbleed: Add SKL return thunk") Reported-by: Lukas Bulwahn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y1+fL4qQEIGZEEKB@hirez.programming.kicks-ass.net --- arch/x86/include/asm/nospec-branch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 82580adbca4b2..3ab90f23e7f7b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -285,7 +285,7 @@ */ .macro UNTRAIN_RET #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_X86_FEATURE_CALL_DEPTH) + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ @@ -296,7 +296,7 @@ .macro UNTRAIN_RET_FROM_CALL #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_X86_FEATURE_CALL_DEPTH) + defined(CONFIG_CALL_DEPTH_TRACKING) ANNOTATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ -- GitLab From 5ebddd7c4951c50142bcb239d4c6a82eff15759e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:26:51 +0200 Subject: [PATCH 0292/1423] kallsyms: Revert "Take callthunks into account" This is a full revert of commit: f1389181622a ("kallsyms: Take callthunks into account") The commit assumes a number of things that are not quite right. Notably it assumes every symbol has PADDING_BYTES in front of it that are not claimed by another symbol. This is not true; even when compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} Notably things like .cold subfunctions do not need to adhere to this change in ABI. It it also not true when build with CFI_CLANG, which claims these PADDING_BYTES in the __cfi_##name symbol. Once the prefix bytes are not consistent and or otherwise claimed the approach this patch takes goes out the window and kallsym resolution will report invalid symbol names. Therefore revert this to make room for another approach. Reported-by: Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lore.kernel.org/r/202210241614.2ae4c1f5-yujie.liu@intel.com Link: https://lkml.kernel.org/r/20221028194453.330970755@infradead.org --- kernel/kallsyms.c | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cc244c02b4cf6..60c20f301a6ba 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -293,12 +293,6 @@ static unsigned long get_symbol_pos(unsigned long addr, return low; } -#ifdef CONFIG_FUNCTION_PADDING_BYTES -#define PADDING_BYTES CONFIG_FUNCTION_PADDING_BYTES -#else -#define PADDING_BYTES 0 -#endif - /* * Lookup an address but don't bother to find any names. */ @@ -306,25 +300,13 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; - int ret; - - addr += PADDING_BYTES; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); - ret = 1; - goto found; - } - - ret = !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf); - if (!ret) { - ret = !!__bpf_address_lookup(addr, symbolsize, - offset, namebuf); + return 1; } -found: - if (ret && offset) - *offset -= PADDING_BYTES; - return ret; + return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } static const char *kallsyms_lookup_buildid(unsigned long addr, @@ -337,8 +319,6 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -368,8 +348,6 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, found: cleanup_symbol_name(namebuf); - if (ret && offset) - *offset -= PADDING_BYTES; return ret; } @@ -396,8 +374,6 @@ int lookup_symbol_name(unsigned long addr, char *symname) symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -425,8 +401,6 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; - addr += PADDING_BYTES; - if (is_ksym_addr(addr)) { unsigned long pos; @@ -443,8 +417,6 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, return res; found: - if (offset) - *offset -= PADDING_BYTES; cleanup_symbol_name(name); return 0; } @@ -470,15 +442,8 @@ static int __sprint_symbol(char *buffer, unsigned long address, len = strlen(buffer); offset -= symbol_offset; - if (add_offset) { - char s = '+'; - - if ((long)offset < 0) { - s = '-'; - offset = 0UL - offset; - } - len += sprintf(buffer + len, "%c%#lx/%#lx", s, offset, size); - } + if (add_offset) + len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) { len += sprintf(buffer + len, " [%s", modname); -- GitLab From 4c91be8e926c6b3734d59b9348e305431484d42b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:49:26 +0200 Subject: [PATCH 0293/1423] objtool: Slice up elf_create_section_symbol() In order to facilitate creation of more symbol types, slice up elf_create_section_symbol() to extract a generic helper that deals with adding ELF symbols. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.396634875@infradead.org --- tools/objtool/elf.c | 56 ++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 89b37cd4ab1dc..3ad89d963e593 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -717,11 +717,11 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, } static struct symbol * -elf_create_section_symbol(struct elf *elf, struct section *sec) +__elf_create_symbol(struct elf *elf, struct symbol *sym) { struct section *symtab, *symtab_shndx; Elf32_Word first_non_local, new_idx; - struct symbol *sym, *old; + struct symbol *old; symtab = find_section_by_name(elf, ".symtab"); if (symtab) { @@ -731,27 +731,16 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) return NULL; } - sym = calloc(1, sizeof(*sym)); - if (!sym) { - perror("malloc"); - return NULL; - } - - sym->name = sec->name; - sym->sec = sec; + new_idx = symtab->sh.sh_size / symtab->sh.sh_entsize; - // st_name 0 - sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); - // st_other 0 - // st_value 0 - // st_size 0 + if (GELF_ST_BIND(sym->sym.st_info) != STB_LOCAL) + goto non_local; /* * Move the first global symbol, as per sh_info, into a new, higher * symbol index. This fees up a spot for a new local symbol. */ first_non_local = symtab->sh.sh_info; - new_idx = symtab->sh.sh_size / symtab->sh.sh_entsize; old = find_symbol_by_index(elf, first_non_local); if (old) { old->idx = new_idx; @@ -769,18 +758,43 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) new_idx = first_non_local; } + /* + * Either way, we will add a LOCAL symbol. + */ + symtab->sh.sh_info += 1; + +non_local: sym->idx = new_idx; if (elf_update_symbol(elf, symtab, symtab_shndx, sym)) { WARN("elf_update_symbol"); return NULL; } - /* - * Either way, we added a LOCAL symbol. - */ - symtab->sh.sh_info += 1; + return sym; +} + +static struct symbol * +elf_create_section_symbol(struct elf *elf, struct section *sec) +{ + struct symbol *sym = calloc(1, sizeof(*sym)); + + if (!sym) { + perror("malloc"); + return NULL; + } - elf_add_symbol(elf, sym); + sym->name = sec->name; + sym->sec = sec; + + // st_name 0 + sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); + // st_other 0 + // st_value 0 + // st_size 0 + + sym = __elf_create_symbol(elf, sym); + if (sym) + elf_add_symbol(elf, sym); return sym; } -- GitLab From 13f60e80e15dd0657c90bcca372ba045630ed9de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 20:29:51 +0200 Subject: [PATCH 0294/1423] objtool: Avoid O(bloody terrible) behaviour -- an ode to libelf Due to how gelf_update_sym*() requires an Elf_Data pointer, and how libelf keeps Elf_Data in a linked list per section, elf_update_symbol() ends up having to iterate this list on each update to find the correct Elf_Data for the index'ed symbol. By allocating one Elf_Data per new symbol, the list grows per new symbol, giving an effective O(n^2) insertion time. This is obviously bloody terrible. Therefore over-allocate the Elf_Data when an extention is needed. Except it turns out libelf disregards Elf_Scn::sh_size in favour of the sum of Elf_Data::d_size. IOW it will happily write out all the unused space and fill it with: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND entries (aka zeros). Which obviously violates the STB_LOCAL placement rule, and is a general pain in the backside for not being the desired behaviour. Manually fix-up the Elf_Data size to avoid this problem before calling elf_update(). This significantly improves performance when adding a significant number of symbols. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.461658986@infradead.org --- tools/objtool/elf.c | 89 +++++++++++++++++++++++++++-- tools/objtool/include/objtool/elf.h | 2 +- 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 3ad89d963e593..36dc78796f58d 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -634,6 +634,12 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, /* end-of-list */ if (!symtab_data) { + /* + * Over-allocate to avoid O(n^2) symbol creation + * behaviour. The down side is that libelf doesn't + * like this; see elf_truncate_section() for the fixup. + */ + int num = max(1U, sym->idx/3); void *buf; if (idx) { @@ -647,28 +653,34 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, if (t) shndx_data = elf_newdata(t); - buf = calloc(1, entsize); + buf = calloc(num, entsize); if (!buf) { WARN("malloc"); return -1; } symtab_data->d_buf = buf; - symtab_data->d_size = entsize; + symtab_data->d_size = num * entsize; symtab_data->d_align = 1; symtab_data->d_type = ELF_T_SYM; - symtab->sh.sh_size += entsize; symtab->changed = true; + symtab->truncate = true; if (t) { - shndx_data->d_buf = &sym->sec->idx; - shndx_data->d_size = sizeof(Elf32_Word); + buf = calloc(num, sizeof(Elf32_Word)); + if (!buf) { + WARN("malloc"); + return -1; + } + + shndx_data->d_buf = buf; + shndx_data->d_size = num * sizeof(Elf32_Word); shndx_data->d_align = sizeof(Elf32_Word); shndx_data->d_type = ELF_T_WORD; - symtab_shndx->sh.sh_size += sizeof(Elf32_Word); symtab_shndx->changed = true; + symtab_shndx->truncate = true; } break; @@ -770,6 +782,14 @@ non_local: return NULL; } + symtab->sh.sh_size += symtab->sh.sh_entsize; + symtab->changed = true; + + if (symtab_shndx) { + symtab_shndx->sh.sh_size += sizeof(Elf32_Word); + symtab_shndx->changed = true; + } + return sym; } @@ -1286,6 +1306,60 @@ int elf_write_reloc(struct elf *elf, struct reloc *reloc) return 0; } +/* + * When Elf_Scn::sh_size is smaller than the combined Elf_Data::d_size + * do you: + * + * A) adhere to the section header and truncate the data, or + * B) ignore the section header and write out all the data you've got? + * + * Yes, libelf sucks and we need to manually truncate if we over-allocate data. + */ +static int elf_truncate_section(struct elf *elf, struct section *sec) +{ + u64 size = sec->sh.sh_size; + bool truncated = false; + Elf_Data *data = NULL; + Elf_Scn *s; + + s = elf_getscn(elf->elf, sec->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + for (;;) { + /* get next data descriptor for the relevant section */ + data = elf_getdata(s, data); + + if (!data) { + if (size) { + WARN("end of section data but non-zero size left\n"); + return -1; + } + return 0; + } + + if (truncated) { + /* when we remove symbols */ + WARN("truncated; but more data\n"); + return -1; + } + + if (!data->d_size) { + WARN("zero size data"); + return -1; + } + + if (data->d_size > size) { + truncated = true; + data->d_size = size; + } + + size -= data->d_size; + } +} + int elf_write(struct elf *elf) { struct section *sec; @@ -1296,6 +1370,9 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { + if (sec->truncate) + elf_truncate_section(elf, sec); + if (sec->changed) { s = elf_getscn(elf->elf, sec->idx); if (!s) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index d28533106b780..9e96a613c50f7 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -38,7 +38,7 @@ struct section { Elf_Data *data; char *name; int idx; - bool changed, text, rodata, noinstr, init; + bool changed, text, rodata, noinstr, init, truncate; }; struct symbol { -- GitLab From 9f2899fe36a623885d8576604cb582328ad32b3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 15:50:42 +0200 Subject: [PATCH 0295/1423] objtool: Add option to generate prefix symbols When code is compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} functions will have PADDING_BYTES of NOP in front of them. Unwinders and other things that symbolize code locations will typically attribute these bytes to the preceding function. Given that these bytes nominally belong to the following symbol this mis-attribution is confusing. Inspired by the fact that CFI_CLANG emits __cfi_##name symbols to claim these bytes, allow objtool to emit __pfx_##name symbols to do the same. Therefore add the objtool --prefix=N argument, to conditionally place a __pfx_##name symbol at N bytes ahead of symbol 'name' when: all these preceding bytes are NOP and name-N is an instruction boundary. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.526899822@infradead.org --- tools/objtool/builtin-check.c | 1 + tools/objtool/check.c | 33 ++++++++++++++++++++++++- tools/objtool/elf.c | 31 +++++++++++++++++++++++ tools/objtool/include/objtool/builtin.h | 1 + tools/objtool/include/objtool/elf.h | 2 ++ 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 0a04f8ea4432d..95fcecee60ce5 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -75,6 +75,7 @@ const struct option check_options[] = { OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), + OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7936312e10c79..27f35f5f831ab 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3417,7 +3417,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ - if (!strncmp(func->name, "__cfi_", 6)) + if (!strncmp(func->name, "__cfi_", 6) || + !strncmp(func->name, "__pfx_", 6)) return 0; WARN("%s() falls through to next function %s()", @@ -3972,6 +3973,34 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } +static int add_prefix_symbol(struct objtool_file *file, struct symbol *func, + struct instruction *insn) +{ + if (!opts.prefix) + return 0; + + for (;;) { + struct instruction *prev = list_prev_entry(insn, list); + u64 offset; + + if (&prev->list == &file->insn_list) + break; + + if (prev->type != INSN_NOP) + break; + + offset = func->offset - prev->offset; + if (offset >= opts.prefix) { + if (offset == opts.prefix) + elf_create_prefix_symbol(file->elf, func, opts.prefix); + break; + } + insn = prev; + } + + return 0; +} + static int validate_symbol(struct objtool_file *file, struct section *sec, struct symbol *sym, struct insn_state *state) { @@ -3990,6 +4019,8 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if (!insn || insn->ignore || insn->visited) return 0; + add_prefix_symbol(file, sym, insn); + state->uaccess = sym->uaccess_safe; ret = validate_branch(file, insn_func(insn), insn, *state); diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 36dc78796f58d..3d636d12d679f 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -819,6 +819,37 @@ elf_create_section_symbol(struct elf *elf, struct section *sec) return sym; } +static int elf_add_string(struct elf *elf, struct section *strtab, char *str); + +struct symbol * +elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size) +{ + struct symbol *sym = calloc(1, sizeof(*sym)); + size_t namelen = strlen(orig->name) + sizeof("__pfx_"); + char *name = malloc(namelen); + + if (!sym || !name) { + perror("malloc"); + return NULL; + } + + snprintf(name, namelen, "__pfx_%s", orig->name); + + sym->name = name; + sym->sec = orig->sec; + + sym->sym.st_name = elf_add_string(elf, NULL, name); + sym->sym.st_info = orig->sym.st_info; + sym->sym.st_value = orig->sym.st_value - size; + sym->sym.st_size = size; + + sym = __elf_create_symbol(elf, sym); + if (sym) + elf_add_symbol(elf, sym); + + return sym; +} + int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct section *insn_sec, unsigned long insn_off) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 22092a9f3cf6c..f341b620dead4 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool stackval; bool static_call; bool uaccess; + int prefix; /* options: */ bool backtrace; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 9e96a613c50f7..b6974e3173aa5 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -146,6 +146,8 @@ static inline bool has_multiple_files(struct elf *elf) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); +struct symbol *elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size); + int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, s64 addend); int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, -- GitLab From b341b20d648bb7e9a3307c33163e7399f0913e66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Oct 2022 21:08:19 +0200 Subject: [PATCH 0296/1423] x86: Add prefix symbols for function padding When code is compiled with: -fpatchable-function-entry=${PADDING_BYTES},${PADDING_BYTES} functions will have PADDING_BYTES of NOP in front of them. Unwinders and other things that symbolize code locations will typically attribute these bytes to the preceding function. Given that these bytes nominally belong to the following symbol this mis-attribution is confusing. Inspired by the fact that CFI_CLANG emits __cfi_##name symbols to claim these bytes, use objtool to emit __pfx_##name symbols to do the same when CFI_CLANG is not used. This then shows the callthunk for symbol 'name' as: __pfx_##name+0x6/0x10 Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yujie Liu Link: https://lkml.kernel.org/r/20221028194453.592512209@infradead.org --- arch/x86/Kconfig | 4 ++++ scripts/Makefile.lib | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b52ad13f0f449..32818aa1dca49 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2471,6 +2471,10 @@ config CALL_THUNKS def_bool n select FUNCTION_ALIGNMENT_16B +config PREFIX_SYMBOLS + def_bool y + depends on CALL_THUNKS && !CFI_CLANG + menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" default y diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 85f02756dc9c1..2e03bcbf2b9b7 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -265,6 +265,7 @@ objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable +objtool-args-$(CONFIG_PREFIX_SYMBOLS) += --prefix=$(CONFIG_FUNCTION_PADDING_BYTES) objtool-args = $(objtool-args-y) \ $(if $(delay-objtool), --link) \ -- GitLab From 9a479f766be1dd777e12e3e57b6ee4c3028a40a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:13 +0200 Subject: [PATCH 0297/1423] objtool: Add --cfi to generate the .cfi_sites section Add the location of all __cfi_##name symbols (as generated by kCFI) to a section such that we might re-write things at kernel boot. Notably; boot time re-hashing and FineIBT are the intended use of this. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221027092842.568039454@infradead.org --- tools/objtool/builtin-check.c | 1 + tools/objtool/check.c | 69 +++++++++++++++++++++++++ tools/objtool/include/objtool/builtin.h | 1 + 3 files changed, 71 insertions(+) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 95fcecee60ce5..868e3e363786f 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -80,6 +80,7 @@ const struct option check_options[] = { OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), + OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 27f35f5f831ab..55066c4935702 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -861,6 +861,68 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) return 0; } +static int create_cfi_sections(struct objtool_file *file) +{ + struct section *sec, *s; + struct symbol *sym; + unsigned int *loc; + int idx; + + sec = find_section_by_name(file->elf, ".cfi_sites"); + if (sec) { + INIT_LIST_HEAD(&file->call_list); + WARN("file already has .cfi_sites section, skipping"); + return 0; + } + + idx = 0; + for_each_sec(file, s) { + if (!s->text) + continue; + + list_for_each_entry(sym, &s->symbol_list, list) { + if (sym->type != STT_FUNC) + continue; + + if (strncmp(sym->name, "__cfi_", 6)) + continue; + + idx++; + } + } + + sec = elf_create_section(file->elf, ".cfi_sites", 0, sizeof(unsigned int), idx); + if (!sec) + return -1; + + idx = 0; + for_each_sec(file, s) { + if (!s->text) + continue; + + list_for_each_entry(sym, &s->symbol_list, list) { + if (sym->type != STT_FUNC) + continue; + + if (strncmp(sym->name, "__cfi_", 6)) + continue; + + loc = (unsigned int *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned int)); + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned int), + R_X86_64_PC32, + s, sym->offset)) + return -1; + + idx++; + } + } + + return 0; +} + static int create_mcount_loc_sections(struct objtool_file *file) { struct section *sec; @@ -4430,6 +4492,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.cfi) { + ret = create_cfi_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (opts.rethunk) { ret = create_return_sites_sections(file); if (ret < 0) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index f341b620dead4..c44ff39df80c6 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -27,6 +27,7 @@ struct opts { bool static_call; bool uaccess; int prefix; + bool cfi; /* options: */ bool backtrace; -- GitLab From 931ab63664f02b17d2213ef36b83e1e50190a0aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:14 +0200 Subject: [PATCH 0298/1423] x86/ibt: Implement FineIBT Implement an alternative CFI scheme that merges both the fine-grained nature of kCFI but also takes full advantage of the coarse grained hardware CFI as provided by IBT. To contrast: kCFI is a pure software CFI scheme and relies on being able to read text -- specifically the instruction *before* the target symbol, and does the hash validation *before* doing the call (otherwise control flow is compromised already). FineIBT is a software and hardware hybrid scheme; by ensuring every branch target starts with a hash validation it is possible to place the hash validation after the branch. This has several advantages: o the (hash) load is avoided; no memop; no RX requirement. o IBT WAIT-FOR-ENDBR state is a speculation stop; by placing the hash validation in the immediate instruction after the branch target there is a minimal speculation window and the whole is a viable defence against SpectreBHB. o Kees feels obliged to mention it is slightly more vulnerable when the attacker can write code. Obviously this patch relies on kCFI, but additionally it also relies on the padding from the call-depth-tracking patches. It uses this padding to place the hash-validation while the call-sites are re-written to modify the indirect target to be 16 bytes in front of the original target, thus hitting this new preamble. Notably, there is no hardware that needs call-depth-tracking (Skylake) and supports IBT (Tigerlake and onwards). Suggested-by: Joao Moreira (Intel) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.634714496@infradead.org --- arch/um/kernel/um_arch.c | 5 + arch/x86/Kconfig | 14 +- arch/x86/Makefile | 2 +- arch/x86/include/asm/alternative.h | 2 + arch/x86/include/asm/linkage.h | 6 +- arch/x86/kernel/alternative.c | 253 +++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/module.c | 20 ++- arch/x86/kernel/vmlinux.lds.S | 9 + include/linux/bpf.h | 2 +- scripts/Makefile.lib | 1 + 11 files changed, 294 insertions(+), 21 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8adf8e89b2558..786b44dc20c98 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -444,6 +444,11 @@ void apply_returns(s32 *start, s32 *end) { } +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32818aa1dca49..479ee63898f57 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2463,17 +2463,27 @@ config FUNCTION_PADDING_BYTES default FUNCTION_PADDING_CFI if CFI_CLANG default FUNCTION_ALIGNMENT +config CALL_PADDING + def_bool n + depends on CC_HAS_ENTRY_PADDING && OBJTOOL + select FUNCTION_ALIGNMENT_16B + +config FINEIBT + def_bool y + depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + select CALL_PADDING + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL config CALL_THUNKS def_bool n - select FUNCTION_ALIGNMENT_16B + select CALL_PADDING config PREFIX_SYMBOLS def_bool y - depends on CALL_THUNKS && !CFI_CLANG + depends on CALL_PADDING && !CFI_CLANG menuconfig SPECULATION_MITIGATIONS bool "Mitigations for speculative execution vulnerabilities" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1640e005092b9..a3a07df8a6095 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,7 @@ ifdef CONFIG_SLS KBUILD_CFLAGS += -mharden-sls=all endif -ifdef CONFIG_CALL_THUNKS +ifdef CONFIG_CALL_PADDING PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES) KBUILD_CFLAGS += $(PADDING_CFLAGS) export PADDING_CFLAGS diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 664c0779375c1..7659217f4d492 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -78,6 +78,8 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; struct paravirt_patch_site; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 45e0df850645c..dd9b8118f7848 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -15,7 +15,7 @@ #define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) -#if defined(CONFIG_CALL_THUNKS) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; #else #define FUNCTION_PADDING @@ -57,7 +57,7 @@ #endif /* __ASSEMBLY__ */ /* - * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_THUNKS) the + * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the * CFI symbol layout changes. * * Without CALL_THUNKS: @@ -81,7 +81,7 @@ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. */ -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define CFI_PRE_PADDING #define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; #else diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b4ac4e58c0106..91b0e63a6238a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -656,6 +657,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt */ @@ -664,31 +687,232 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) - continue; + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_endbr(addr - 16, false); + } +} + +#else + +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; - if (WARN_ON_ONCE(!is_endbr(endbr))) + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, jmp, 2); + } - /* - * When we have IBT, the lack of ENDBR will trigger #CP - */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + return 0; +} + +/* .cfi_sites */ +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } + + return 0; +} + +/* .retpoline_sites */ +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT)) + return; + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using FineIBT CFI\n"); + + return; + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -996,6 +1220,9 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2bec4b4b2c50d..423a760fa9de2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -609,6 +609,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 2fb9de2cef40e..0142982e94c5a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -255,7 +255,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, - *calls = NULL; + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -277,6 +277,8 @@ int module_finalize(const Elf_Ehdr *hdr, returns = s; if (!strcmp(".call_sites", secstrings + s->sh_name)) calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -289,6 +291,22 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49f3f86433c7d..2e0ee14229bff 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,15 @@ SECTIONS } #endif +#ifdef CONFIG_FINEIBT + . = ALIGN(8); + .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) { + __cfi_sites = .; + *(.cfi_sites) + __cfi_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5296aea9b5b40..923a3d5080474 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -984,7 +984,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } #ifdef CONFIG_X86_64 -#ifdef CONFIG_CALL_THUNKS +#ifdef CONFIG_CALL_PADDING #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5+CONFIG_FUNCTION_PADDING_BYTES,CONFIG_FUNCTION_PADDING_BYTES))) #else #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2e03bcbf2b9b7..2b2fab705a633 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -256,6 +256,7 @@ objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt +objtool-args-$(CONFIG_FINEIBT) += --cfi objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_RETPOLINE) += --retpoline -- GitLab From 082c4c815252ea333b0f3a51e336df60c2314fe2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:15 +0200 Subject: [PATCH 0299/1423] x86/cfi: Boot time selection of CFI scheme Add the "cfi=" boot parameter to allow people to select a CFI scheme at boot time. Mostly useful for development / debugging. Requested-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.699804264@infradead.org --- arch/x86/kernel/alternative.c | 99 ++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 91b0e63a6238a..9d3b58748ca53 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -702,6 +702,47 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_FINEIBT + +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_DEFAULT; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + /* * kCFI FineIBT * @@ -868,30 +909,52 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; - if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT)) + if (cfi_mode == CFI_DEFAULT) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + cfi_mode = CFI_FINEIBT; + } + + switch (cfi_mode) { + case CFI_OFF: + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Disabling CFI\n"); return; - /* - * Rewrite the callers to not use the __cfi_ stubs, such that we might - * rewrite them. This disables all CFI. If this succeeds but any of the - * later stages fails, we're without CFI. - */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); - if (ret) - goto err; + case CFI_KCFI: + if (builtin) + pr_info("Using kCFI\n"); + return; - ret = cfi_rewrite_preamble(start_cfi, end_cfi); - if (ret) - goto err; + case CFI_FINEIBT: + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); - if (ret) - goto err; + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) + pr_info("Using FineIBT CFI\n"); + return; - return; + default: + break; + } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); -- GitLab From 0c3e806ec0f9771fa1f34c60499097d9260a8bb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Oct 2022 11:28:16 +0200 Subject: [PATCH 0300/1423] x86/cfi: Add boot time hash randomization In order to avoid known hashes (from knowing the boot image), randomize the CFI hashes with a per-boot random seed. Suggested-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20221027092842.765195516@infradead.org --- arch/x86/kernel/alternative.c | 120 ++++++++++++++++++++++++++++++---- 1 file changed, 108 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3b58748ca53..aa7f791585c5c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -711,6 +711,24 @@ enum cfi_mode { }; static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. + */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} static __init int cfi_parse_cmdline(char *str) { @@ -728,10 +746,13 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_DEFAULT; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; + cfi_rand = false; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -856,7 +877,50 @@ static int cfi_disable_callers(s32 *start, s32 *end) return 0; } +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. + */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ + continue; + + text_poke_early(addr, mov, 2); + } + + return 0; +} + /* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} + static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; @@ -879,6 +943,25 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) } /* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -915,31 +998,44 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_mode = CFI_FINEIBT; } - switch (cfi_mode) { - case CFI_OFF: - ret = cfi_disable_callers(start_retpoline, end_retpoline); + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) + cfi_seed = get_random_u32(); + + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: if (builtin) pr_info("Disabling CFI\n"); return; case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + if (builtin) pr_info("Using kCFI\n"); return; case CFI_FINEIBT: - /* - * Rewrite the callers to not use the __cfi_ stubs, such that we might - * rewrite them. This disables all CFI. If this succeeds but any of the - * later stages fails, we're without CFI. - */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); - if (ret) - goto err; - ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; -- GitLab From 9e4a617757273a86b560c1ece40c48e4940a3c79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Sep 2022 02:24:53 -0700 Subject: [PATCH 0301/1423] string: Add __realloc_size hint to kmemdup() Add __realloc_size() hint to kmemdup() so the compiler can reason about the length of the returned buffer. (These must not use __alloc_size, since those include __malloc which says the contents aren't defined[1]). [1] https://lore.kernel.org/linux-hardening/d199c2af-06af-8a50-a6a1-00eefa0b67b4@prevas.dk/ Cc: Rasmus Villemoes Cc: Guenter Roeck Cc: Andy Shevchenko Cc: Paolo Abeni Cc: Geert Uytterhoeven Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 3 ++- include/linux/string.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index e5b39b1cc2fc1..49782f63f015a 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -659,7 +659,8 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) return __real_memchr_inv(p, c, size); } -extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) + __realloc_size(2); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { size_t p_size = __struct_size(p); diff --git a/include/linux/string.h b/include/linux/string.h index cf7607b321027..db28802ab0a65 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -176,7 +176,7 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup(const void *src, size_t len, gfp_t gfp); +extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); -- GitLab From 41eefc46a3a4682976afb5f8c4b9734ed6bfd406 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 Oct 2022 09:51:46 -0700 Subject: [PATCH 0302/1423] string: Convert strscpy() self-test to KUnit Convert the strscpy() self-test to a KUnit test. Cc: David Gow Cc: Tobin C. Harding Tested-by: Nathan Chancellor Link: https://lore.kernel.org/lkml/Y072ZMk/hNkfwqMv@dev-arch.thelio-3990X Signed-off-by: Kees Cook --- MAINTAINERS | 1 + lib/Kconfig.debug | 8 ++- lib/Makefile | 2 +- lib/strscpy_kunit.c | 129 +++++++++++++++++++++++++++++++++++++ lib/test_strscpy.c | 150 -------------------------------------------- 5 files changed, 136 insertions(+), 154 deletions(-) create mode 100644 lib/strscpy_kunit.c delete mode 100644 lib/test_strscpy.c diff --git a/MAINTAINERS b/MAINTAINERS index 9dd8d74c4df09..232d78340d79e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8045,6 +8045,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: include/linux/fortify-string.h F: lib/fortify_kunit.c F: lib/memcpy_kunit.c +F: lib/strscpy_kunit.c F: lib/test_fortify/* F: scripts/test_fortify.sh K: \b__NO_FORTIFY\b diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa2..e0a4d52e434c3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2215,9 +2215,6 @@ config STRING_SELFTEST config TEST_STRING_HELPERS tristate "Test functions located in the string_helpers module at runtime" -config TEST_STRSCPY - tristate "Test strscpy*() family of functions at runtime" - config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" @@ -2583,6 +2580,11 @@ config HW_BREAKPOINT_KUNIT_TEST If unsure, say N. +config STRSCPY_KUNIT_TEST + tristate "Test strscpy*() family of functions at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/Makefile b/lib/Makefile index 161d6a724ff71..1905e5c268495 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -82,7 +82,6 @@ obj-$(CONFIG_TEST_DYNAMIC_DEBUG) += test_dynamic_debug.o obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_SCANF) += test_scanf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o -obj-$(CONFIG_TEST_STRSCPY) += test_strscpy.o obj-$(CONFIG_TEST_UUID) += test_uuid.o obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o @@ -380,6 +379,7 @@ obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o +obj-$(CONFIG_STRSCPY_KUNIT_TEST) += strscpy_kunit.o obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o diff --git a/lib/strscpy_kunit.c b/lib/strscpy_kunit.c new file mode 100644 index 0000000000000..98523f828d3a2 --- /dev/null +++ b/lib/strscpy_kunit.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel module for testing 'strscpy' family of functions. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +/* + * tc() - Run a specific test case. + * @src: Source string, argument to strscpy_pad() + * @count: Size of destination buffer, argument to strscpy_pad() + * @expected: Expected return value from call to strscpy_pad() + * @terminator: 1 if there should be a terminating null byte 0 otherwise. + * @chars: Number of characters from the src string expected to be + * written to the dst buffer. + * @pad: Number of pad characters expected (in the tail of dst buffer). + * (@pad does not include the null terminator byte.) + * + * Calls strscpy_pad() and verifies the return value and state of the + * destination buffer after the call returns. + */ +static void tc(struct kunit *test, char *src, int count, int expected, + int chars, int terminator, int pad) +{ + int nr_bytes_poison; + int max_expected; + int max_count; + int written; + char buf[6]; + int index, i; + const char POISON = 'z'; + + KUNIT_ASSERT_TRUE_MSG(test, src != NULL, + "null source string not supported"); + + memset(buf, POISON, sizeof(buf)); + /* Future proofing test suite, validate args */ + max_count = sizeof(buf) - 2; /* Space for null and to verify overflow */ + max_expected = count - 1; /* Space for the null */ + + KUNIT_ASSERT_LE_MSG(test, count, max_count, + "count (%d) is too big (%d) ... aborting", count, max_count); + KUNIT_EXPECT_LE_MSG(test, expected, max_expected, + "expected (%d) is bigger than can possibly be returned (%d)", + expected, max_expected); + + written = strscpy_pad(buf, src, count); + KUNIT_ASSERT_EQ(test, written, expected); + + if (count && written == -E2BIG) { + KUNIT_ASSERT_EQ_MSG(test, 0, strncmp(buf, src, count - 1), + "buffer state invalid for -E2BIG"); + KUNIT_ASSERT_EQ_MSG(test, buf[count - 1], '\0', + "too big string is not null terminated correctly"); + } + + for (i = 0; i < chars; i++) + KUNIT_ASSERT_EQ_MSG(test, buf[i], src[i], + "buf[i]==%c != src[i]==%c", buf[i], src[i]); + + if (terminator) + KUNIT_ASSERT_EQ_MSG(test, buf[count - 1], '\0', + "string is not null terminated correctly"); + + for (i = 0; i < pad; i++) { + index = chars + terminator + i; + KUNIT_ASSERT_EQ_MSG(test, buf[index], '\0', + "padding missing at index: %d", i); + } + + nr_bytes_poison = sizeof(buf) - chars - terminator - pad; + for (i = 0; i < nr_bytes_poison; i++) { + index = sizeof(buf) - 1 - i; /* Check from the end back */ + KUNIT_ASSERT_EQ_MSG(test, buf[index], POISON, + "poison value missing at index: %d", i); + } +} + +static void strscpy_test(struct kunit *test) +{ + /* + * tc() uses a destination buffer of size 6 and needs at + * least 2 characters spare (one for null and one to check for + * overflow). This means we should only call tc() with + * strings up to a maximum of 4 characters long and 'count' + * should not exceed 4. To test with longer strings increase + * the buffer size in tc(). + */ + + /* tc(test, src, count, expected, chars, terminator, pad) */ + tc(test, "a", 0, -E2BIG, 0, 0, 0); + tc(test, "", 0, -E2BIG, 0, 0, 0); + + tc(test, "a", 1, -E2BIG, 0, 1, 0); + tc(test, "", 1, 0, 0, 1, 0); + + tc(test, "ab", 2, -E2BIG, 1, 1, 0); + tc(test, "a", 2, 1, 1, 1, 0); + tc(test, "", 2, 0, 0, 1, 1); + + tc(test, "abc", 3, -E2BIG, 2, 1, 0); + tc(test, "ab", 3, 2, 2, 1, 0); + tc(test, "a", 3, 1, 1, 1, 1); + tc(test, "", 3, 0, 0, 1, 2); + + tc(test, "abcd", 4, -E2BIG, 3, 1, 0); + tc(test, "abc", 4, 3, 3, 1, 0); + tc(test, "ab", 4, 2, 2, 1, 1); + tc(test, "a", 4, 1, 1, 1, 2); + tc(test, "", 4, 0, 0, 1, 3); +} + +static struct kunit_case strscpy_test_cases[] = { + KUNIT_CASE(strscpy_test), + {} +}; + +static struct kunit_suite strscpy_test_suite = { + .name = "strscpy", + .test_cases = strscpy_test_cases, +}; + +kunit_test_suite(strscpy_test_suite); + +MODULE_AUTHOR("Tobin C. Harding "); +MODULE_LICENSE("GPL"); diff --git a/lib/test_strscpy.c b/lib/test_strscpy.c deleted file mode 100644 index a827f94601f5d..0000000000000 --- a/lib/test_strscpy.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include - -#include "../tools/testing/selftests/kselftest_module.h" - -/* - * Kernel module for testing 'strscpy' family of functions. - */ - -KSTM_MODULE_GLOBALS(); - -/* - * tc() - Run a specific test case. - * @src: Source string, argument to strscpy_pad() - * @count: Size of destination buffer, argument to strscpy_pad() - * @expected: Expected return value from call to strscpy_pad() - * @terminator: 1 if there should be a terminating null byte 0 otherwise. - * @chars: Number of characters from the src string expected to be - * written to the dst buffer. - * @pad: Number of pad characters expected (in the tail of dst buffer). - * (@pad does not include the null terminator byte.) - * - * Calls strscpy_pad() and verifies the return value and state of the - * destination buffer after the call returns. - */ -static int __init tc(char *src, int count, int expected, - int chars, int terminator, int pad) -{ - int nr_bytes_poison; - int max_expected; - int max_count; - int written; - char buf[6]; - int index, i; - const char POISON = 'z'; - - total_tests++; - - if (!src) { - pr_err("null source string not supported\n"); - return -1; - } - - memset(buf, POISON, sizeof(buf)); - /* Future proofing test suite, validate args */ - max_count = sizeof(buf) - 2; /* Space for null and to verify overflow */ - max_expected = count - 1; /* Space for the null */ - if (count > max_count) { - pr_err("count (%d) is too big (%d) ... aborting", count, max_count); - return -1; - } - if (expected > max_expected) { - pr_warn("expected (%d) is bigger than can possibly be returned (%d)", - expected, max_expected); - } - - written = strscpy_pad(buf, src, count); - if ((written) != (expected)) { - pr_err("%d != %d (written, expected)\n", written, expected); - goto fail; - } - - if (count && written == -E2BIG) { - if (strncmp(buf, src, count - 1) != 0) { - pr_err("buffer state invalid for -E2BIG\n"); - goto fail; - } - if (buf[count - 1] != '\0') { - pr_err("too big string is not null terminated correctly\n"); - goto fail; - } - } - - for (i = 0; i < chars; i++) { - if (buf[i] != src[i]) { - pr_err("buf[i]==%c != src[i]==%c\n", buf[i], src[i]); - goto fail; - } - } - - if (terminator) { - if (buf[count - 1] != '\0') { - pr_err("string is not null terminated correctly\n"); - goto fail; - } - } - - for (i = 0; i < pad; i++) { - index = chars + terminator + i; - if (buf[index] != '\0') { - pr_err("padding missing at index: %d\n", i); - goto fail; - } - } - - nr_bytes_poison = sizeof(buf) - chars - terminator - pad; - for (i = 0; i < nr_bytes_poison; i++) { - index = sizeof(buf) - 1 - i; /* Check from the end back */ - if (buf[index] != POISON) { - pr_err("poison value missing at index: %d\n", i); - goto fail; - } - } - - return 0; -fail: - failed_tests++; - return -1; -} - -static void __init selftest(void) -{ - /* - * tc() uses a destination buffer of size 6 and needs at - * least 2 characters spare (one for null and one to check for - * overflow). This means we should only call tc() with - * strings up to a maximum of 4 characters long and 'count' - * should not exceed 4. To test with longer strings increase - * the buffer size in tc(). - */ - - /* tc(src, count, expected, chars, terminator, pad) */ - KSTM_CHECK_ZERO(tc("a", 0, -E2BIG, 0, 0, 0)); - KSTM_CHECK_ZERO(tc("", 0, -E2BIG, 0, 0, 0)); - - KSTM_CHECK_ZERO(tc("a", 1, -E2BIG, 0, 1, 0)); - KSTM_CHECK_ZERO(tc("", 1, 0, 0, 1, 0)); - - KSTM_CHECK_ZERO(tc("ab", 2, -E2BIG, 1, 1, 0)); - KSTM_CHECK_ZERO(tc("a", 2, 1, 1, 1, 0)); - KSTM_CHECK_ZERO(tc("", 2, 0, 0, 1, 1)); - - KSTM_CHECK_ZERO(tc("abc", 3, -E2BIG, 2, 1, 0)); - KSTM_CHECK_ZERO(tc("ab", 3, 2, 2, 1, 0)); - KSTM_CHECK_ZERO(tc("a", 3, 1, 1, 1, 1)); - KSTM_CHECK_ZERO(tc("", 3, 0, 0, 1, 2)); - - KSTM_CHECK_ZERO(tc("abcd", 4, -E2BIG, 3, 1, 0)); - KSTM_CHECK_ZERO(tc("abc", 4, 3, 3, 1, 0)); - KSTM_CHECK_ZERO(tc("ab", 4, 2, 2, 1, 1)); - KSTM_CHECK_ZERO(tc("a", 4, 1, 1, 1, 2)); - KSTM_CHECK_ZERO(tc("", 4, 0, 0, 1, 3)); -} - -KSTM_MODULE_LOADERS(test_strscpy); -MODULE_AUTHOR("Tobin C. Harding "); -MODULE_LICENSE("GPL"); -- GitLab From 62e1cbfc5d795381a0f237ae7ee229a92d51cf9e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 Oct 2022 09:17:03 -0700 Subject: [PATCH 0303/1423] fortify: Short-circuit known-safe calls to strscpy() Replacing compile-time safe calls of strcpy()-related functions with strscpy() was always calling the full strscpy() logic when a builtin would be better. For example: char buf[16]; strcpy(buf, "yes"); would reduce to __builtin_memcpy(buf, "yes", 4), but not if it was: strscpy(buf, yes, sizeof(buf)); Fix this by checking if all sizes are known at compile-time. Cc: linux-hardening@vger.kernel.org Tested-by: Nathan Chancellor Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 10 ++++++++++ lib/strscpy_kunit.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 49782f63f015a..32a66d4b30ca3 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -314,6 +314,16 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s if (__compiletime_lessthan(p_size, size)) __write_overflow(); + /* Short-circuit for compile-time known-safe lengths. */ + if (__compiletime_lessthan(p_size, SIZE_MAX)) { + len = __compiletime_strlen(q); + + if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { + __underlying_memcpy(p, q, len + 1); + return len; + } + } + /* * This call protects from read overflow, because len will default to q * length if it smaller than size. diff --git a/lib/strscpy_kunit.c b/lib/strscpy_kunit.c index 98523f828d3a2..a6b6344354ed5 100644 --- a/lib/strscpy_kunit.c +++ b/lib/strscpy_kunit.c @@ -81,6 +81,8 @@ static void tc(struct kunit *test, char *src, int count, int expected, static void strscpy_test(struct kunit *test) { + char dest[8]; + /* * tc() uses a destination buffer of size 6 and needs at * least 2 characters spare (one for null and one to check for @@ -111,6 +113,17 @@ static void strscpy_test(struct kunit *test) tc(test, "ab", 4, 2, 2, 1, 1); tc(test, "a", 4, 1, 1, 1, 2); tc(test, "", 4, 0, 0, 1, 3); + + /* Compile-time-known source strings. */ + KUNIT_EXPECT_EQ(test, strscpy(dest, "", ARRAY_SIZE(dest)), 0); + KUNIT_EXPECT_EQ(test, strscpy(dest, "", 3), 0); + KUNIT_EXPECT_EQ(test, strscpy(dest, "", 1), 0); + KUNIT_EXPECT_EQ(test, strscpy(dest, "", 0), -E2BIG); + KUNIT_EXPECT_EQ(test, strscpy(dest, "Fixed", ARRAY_SIZE(dest)), 5); + KUNIT_EXPECT_EQ(test, strscpy(dest, "Fixed", 3), -E2BIG); + KUNIT_EXPECT_EQ(test, strscpy(dest, "Fixed", 1), -E2BIG); + KUNIT_EXPECT_EQ(test, strscpy(dest, "Fixed", 0), -E2BIG); + KUNIT_EXPECT_EQ(test, strscpy(dest, "This is too long", ARRAY_SIZE(dest)), -E2BIG); } static struct kunit_case strscpy_test_cases[] = { -- GitLab From fb3d88ab354b3b07e805aba9d67cbb43d23dc70e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 Oct 2022 19:45:23 -0700 Subject: [PATCH 0304/1423] siphash: Convert selftest to KUnit Convert the siphash self-test to KUnit so it will be included in "all KUnit tests" coverage, and can be run individually still: $ ./tools/testing/kunit/kunit.py run siphash ... [02:58:45] Starting KUnit Kernel (1/1)... [02:58:45] ============================================================ [02:58:45] =================== siphash (1 subtest) ==================== [02:58:45] [PASSED] siphash_test [02:58:45] ===================== [PASSED] siphash ===================== [02:58:45] ============================================================ [02:58:45] Testing complete. Ran 1 tests: passed: 1 [02:58:45] Elapsed time: 21.421s total, 4.306s configuring, 16.947s building, 0.148s running Cc: Vlastimil Babka Cc: "Steven Rostedt (Google)" Cc: Yury Norov Cc: Sander Vanheule Acked-by: "Jason A. Donenfeld" Link: https://lore.kernel.org/lkml/CAHmME9r+9MPH6zk3Vn=buEMSbQiWMFryqqzerKarmjYk+tHLJA@mail.gmail.com Tested-by: David Gow Signed-off-by: Kees Cook --- MAINTAINERS | 2 +- lib/Kconfig.debug | 20 +-- lib/Makefile | 2 +- lib/{test_siphash.c => siphash_kunit.c} | 165 ++++++++++-------------- 4 files changed, 83 insertions(+), 106 deletions(-) rename lib/{test_siphash.c => siphash_kunit.c} (60%) diff --git a/MAINTAINERS b/MAINTAINERS index 232d78340d79e..1cd80c1137212 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18864,7 +18864,7 @@ M: Jason A. Donenfeld S: Maintained F: include/linux/siphash.h F: lib/siphash.c -F: lib/test_siphash.c +F: lib/siphash_kunit.c SIS 190 ETHERNET DRIVER M: Francois Romieu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e0a4d52e434c3..50cc1c4efaddb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2244,15 +2244,6 @@ config TEST_RHASHTABLE If unsure, say N. -config TEST_SIPHASH - tristate "Perform selftest on siphash functions" - help - Enable this option to test the kernel's siphash () hash - functions on boot (or module load). - - This is intended to help people writing architecture-specific - optimized versions. If unsure, say N. - config TEST_IDA tristate "Perform selftest on IDA functions" @@ -2585,6 +2576,17 @@ config STRSCPY_KUNIT_TEST depends on KUNIT default KUNIT_ALL_TESTS +config SIPHASH_KUNIT_TEST + tristate "Perform selftest on siphash functions" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the kernel's siphash () hash + functions on boot (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/Makefile b/lib/Makefile index 1905e5c268495..77c7951c8cf09 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -62,7 +62,6 @@ obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o -obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o @@ -380,6 +379,7 @@ CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o obj-$(CONFIG_STRSCPY_KUNIT_TEST) += strscpy_kunit.o +obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o diff --git a/lib/test_siphash.c b/lib/siphash_kunit.c similarity index 60% rename from lib/test_siphash.c rename to lib/siphash_kunit.c index a96788d0141d4..a3c697e8be357 100644 --- a/lib/test_siphash.c +++ b/lib/siphash_kunit.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -109,114 +110,88 @@ static const u32 test_vectors_hsiphash[64] = { }; #endif -static int __init siphash_test_init(void) +#define chk(hash, vector, fmt...) \ + KUNIT_EXPECT_EQ_MSG(test, hash, vector, fmt) + +static void siphash_test(struct kunit *test) { u8 in[64] __aligned(SIPHASH_ALIGNMENT); u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT); u8 i; - int ret = 0; for (i = 0; i < 64; ++i) { in[i] = i; in_unaligned[i + 1] = i; - if (siphash(in, i, &test_key_siphash) != - test_vectors_siphash[i]) { - pr_info("siphash self-test aligned %u: FAIL\n", i + 1); - ret = -EINVAL; - } - if (siphash(in_unaligned + 1, i, &test_key_siphash) != - test_vectors_siphash[i]) { - pr_info("siphash self-test unaligned %u: FAIL\n", i + 1); - ret = -EINVAL; - } - if (hsiphash(in, i, &test_key_hsiphash) != - test_vectors_hsiphash[i]) { - pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1); - ret = -EINVAL; - } - if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) != - test_vectors_hsiphash[i]) { - pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1); - ret = -EINVAL; - } - } - if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) != - test_vectors_siphash[8]) { - pr_info("siphash self-test 1u64: FAIL\n"); - ret = -EINVAL; - } - if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, - &test_key_siphash) != test_vectors_siphash[16]) { - pr_info("siphash self-test 2u64: FAIL\n"); - ret = -EINVAL; + chk(siphash(in, i, &test_key_siphash), + test_vectors_siphash[i], + "siphash self-test aligned %u: FAIL", i + 1); + chk(siphash(in_unaligned + 1, i, &test_key_siphash), + test_vectors_siphash[i], + "siphash self-test unaligned %u: FAIL", i + 1); + chk(hsiphash(in, i, &test_key_hsiphash), + test_vectors_hsiphash[i], + "hsiphash self-test aligned %u: FAIL", i + 1); + chk(hsiphash(in_unaligned + 1, i, &test_key_hsiphash), + test_vectors_hsiphash[i], + "hsiphash self-test unaligned %u: FAIL", i + 1); } - if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, - 0x1716151413121110ULL, &test_key_siphash) != - test_vectors_siphash[24]) { - pr_info("siphash self-test 3u64: FAIL\n"); - ret = -EINVAL; - } - if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + chk(siphash_1u64(0x0706050403020100ULL, &test_key_siphash), + test_vectors_siphash[8], + "siphash self-test 1u64: FAIL"); + chk(siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + &test_key_siphash), + test_vectors_siphash[16], + "siphash self-test 2u64: FAIL"); + chk(siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + 0x1716151413121110ULL, &test_key_siphash), + test_vectors_siphash[24], + "siphash self-test 3u64: FAIL"); + chk(siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, - &test_key_siphash) != test_vectors_siphash[32]) { - pr_info("siphash self-test 4u64: FAIL\n"); - ret = -EINVAL; - } - if (siphash_1u32(0x03020100U, &test_key_siphash) != - test_vectors_siphash[4]) { - pr_info("siphash self-test 1u32: FAIL\n"); - ret = -EINVAL; - } - if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) != - test_vectors_siphash[8]) { - pr_info("siphash self-test 2u32: FAIL\n"); - ret = -EINVAL; - } - if (siphash_3u32(0x03020100U, 0x07060504U, - 0x0b0a0908U, &test_key_siphash) != - test_vectors_siphash[12]) { - pr_info("siphash self-test 3u32: FAIL\n"); - ret = -EINVAL; - } - if (siphash_4u32(0x03020100U, 0x07060504U, - 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) != - test_vectors_siphash[16]) { - pr_info("siphash self-test 4u32: FAIL\n"); - ret = -EINVAL; - } - if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) != - test_vectors_hsiphash[4]) { - pr_info("hsiphash self-test 1u32: FAIL\n"); - ret = -EINVAL; - } - if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) != - test_vectors_hsiphash[8]) { - pr_info("hsiphash self-test 2u32: FAIL\n"); - ret = -EINVAL; - } - if (hsiphash_3u32(0x03020100U, 0x07060504U, - 0x0b0a0908U, &test_key_hsiphash) != - test_vectors_hsiphash[12]) { - pr_info("hsiphash self-test 3u32: FAIL\n"); - ret = -EINVAL; - } - if (hsiphash_4u32(0x03020100U, 0x07060504U, - 0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) != - test_vectors_hsiphash[16]) { - pr_info("hsiphash self-test 4u32: FAIL\n"); - ret = -EINVAL; - } - if (!ret) - pr_info("self-tests: pass\n"); - return ret; + &test_key_siphash), + test_vectors_siphash[32], + "siphash self-test 4u64: FAIL"); + chk(siphash_1u32(0x03020100U, &test_key_siphash), + test_vectors_siphash[4], + "siphash self-test 1u32: FAIL"); + chk(siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash), + test_vectors_siphash[8], + "siphash self-test 2u32: FAIL"); + chk(siphash_3u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, &test_key_siphash), + test_vectors_siphash[12], + "siphash self-test 3u32: FAIL"); + chk(siphash_4u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash), + test_vectors_siphash[16], + "siphash self-test 4u32: FAIL"); + chk(hsiphash_1u32(0x03020100U, &test_key_hsiphash), + test_vectors_hsiphash[4], + "hsiphash self-test 1u32: FAIL"); + chk(hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash), + test_vectors_hsiphash[8], + "hsiphash self-test 2u32: FAIL"); + chk(hsiphash_3u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, &test_key_hsiphash), + test_vectors_hsiphash[12], + "hsiphash self-test 3u32: FAIL"); + chk(hsiphash_4u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash), + test_vectors_hsiphash[16], + "hsiphash self-test 4u32: FAIL"); } -static void __exit siphash_test_exit(void) -{ -} +static struct kunit_case siphash_test_cases[] = { + KUNIT_CASE(siphash_test), + {} +}; + +static struct kunit_suite siphash_test_suite = { + .name = "siphash", + .test_cases = siphash_test_cases, +}; -module_init(siphash_test_init); -module_exit(siphash_test_exit); +kunit_test_suite(siphash_test_suite); MODULE_AUTHOR("Jason A. Donenfeld "); MODULE_LICENSE("Dual BSD/GPL"); -- GitLab From e9a40e1585d792751d3a122392695e5a53032809 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 25 Oct 2022 16:05:18 -0700 Subject: [PATCH 0305/1423] fortify: Do not cast to "unsigned char" Do not cast to "unsigned char", as this needlessly creates type problems when attempting builds without -Wno-pointer-sign[1]. The intent of the cast is to drop possible "const" types. [1] https://lore.kernel.org/lkml/CAHk-=wgz3Uba8w7kdXhsqR1qvfemYL+OFQdefJnkeqXG8qZ_pA@mail.gmail.com/ Suggested-by: Linus Torvalds Fixes: 3009f891bb9f ("fortify: Allow strlen() and strnlen() to pass compile-time known lengths") Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 32a66d4b30ca3..aa31f54f8b573 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -18,7 +18,7 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning(" #define __compiletime_strlen(p) \ ({ \ - unsigned char *__p = (unsigned char *)(p); \ + char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ -- GitLab From 5a17f040fa332e71a45ca9ff02d6979d9176a423 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 Oct 2022 16:31:11 -0700 Subject: [PATCH 0306/1423] cred: Do not default to init_cred in prepare_kernel_cred() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common exploit pattern for ROP attacks is to abuse prepare_kernel_cred() in order to construct escalated privileges[1]. Instead of providing a short-hand argument (NULL) to the "daemon" argument to indicate using init_cred as the base cred, require that "daemon" is always set to an actual task. Replace all existing callers that were passing NULL with &init_task. Future attacks will need to have sufficiently powerful read/write primitives to have found an appropriately privileged task and written it to the ROP stack as an argument to succeed, which is similarly difficult to the prior effort needed to escalate privileges before struct cred existed: locate the current cred and overwrite the uid member. This has the added benefit of meaning that prepare_kernel_cred() can no longer exceed the privileges of the init task, which may have changed from the original init_cred (e.g. dropping capabilities from the bounding set). [1] https://google.com/search?q=commit_creds(prepare_kernel_cred(0)) Cc: "Eric W. Biederman" Cc: David Howells Cc: "Rafael J. Wysocki" Cc: Steve French Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Namjae Jeon Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Cc: Jeff Layton Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: "Michal Koutný" Cc: Peter Zijlstra Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Luis Chamberlain Reviewed-by: Sergey Senozhatsky Acked-by: Russ Weight Acked-by: Greg Kroah-Hartman Acked-by: Paulo Alcantara (SUSE) Link: https://lore.kernel.org/r/20221026232943.never.775-kees@kernel.org --- drivers/base/firmware_loader/main.c | 2 +- fs/cifs/cifs_spnego.c | 2 +- fs/cifs/cifsacl.c | 2 +- fs/ksmbd/smb_common.c | 2 +- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/nfs4callback.c | 2 +- kernel/cred.c | 15 +++++++-------- net/dns_resolver/dns_key.c | 2 +- 9 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 7c3590fd97c28..017c4cdb219eb 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -821,7 +821,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name, * called by a driver when serving an unrelated request from userland, we use * the kernel credentials to read the file. */ - kern_cred = prepare_kernel_cred(NULL); + kern_cred = prepare_kernel_cred(&init_task); if (!kern_cred) { ret = -ENOMEM; goto out; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 342717bf1dc28..6f3285f1dfee5 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -189,7 +189,7 @@ init_cifs_spnego(void) * spnego upcalls. */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f3137..574de2b225ae6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -465,7 +465,7 @@ init_cifs_idmap(void) * this is used to prevent malicious redirections from being installed * with add_key(). */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index d96da872d70a1..2a4fbbd55b91f 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -623,7 +623,7 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (share->force_gid != KSMBD_SHARE_INVALID_GID) gid = share->force_gid; - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1ec79ccf89ad2..7deb3cd76abe4 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -493,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, gid = make_kgid(&init_user_ns, id); if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); else { unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); memalloc_nofs_restore(nofs_flags); } rc = -ENOMEM; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index e3fdd2f45b01f..25a7c771cfd89 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -203,7 +203,7 @@ int nfs_idmap_init(void) printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f0e69edf5f0f1..4a9e8d17e56aa 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -870,7 +870,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r } else { struct cred *kcred; - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); if (!kcred) return NULL; diff --git a/kernel/cred.c b/kernel/cred.c index e10c15f51c1fe..811ad654abd18 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -701,9 +701,9 @@ void __init cred_init(void) * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * - * @daemon is used to provide a base for the security record, but can be NULL. - * If @daemon is supplied, then the security data will be derived from that; - * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * @daemon is used to provide a base cred, with the security data derived from + * that; if this is "&init_task", they'll be set to 0, no groups, full + * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * @@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) const struct cred *old; struct cred *new; + if (WARN_ON_ONCE(!daemon)) + return NULL; + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); - if (daemon) - old = get_task_cred(daemon); - else - old = get_cred(&init_cred); - + old = get_task_cred(daemon); validate_creds(old); *new = *old; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 3aced951d5ab8..01e54b46ae0b9 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -337,7 +337,7 @@ static int __init init_dns_resolver(void) * this is used to prevent malicious redirections from being installed * with add_key(). */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; -- GitLab From e1789d7c752ed001cf1a4bbbd624f70a7dd3c6db Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 25 Oct 2022 00:30:23 -0700 Subject: [PATCH 0307/1423] kbuild: upgrade the orphan section warning to an error if CONFIG_WERROR is set Andrew Cooper suggested upgrading the orphan section warning to a hard link error. However Nathan Chancellor said outright turning the warning into an error with no escape hatch might be too aggressive, as we have had these warnings triggered by new compiler generated sections, and suggested turning orphan sections into an error only if CONFIG_WERROR is set. Kees Cook echoed and emphasized that the mandate from Linus is that we should avoid breaking builds. It wrecks bisection, it causes problems across compiler versions, etc. Thus upgrade the orphan section warning to a hard link error only if CONFIG_WERROR is set. Suggested-by: Andrew Cooper Suggested-by: Nathan Chancellor Signed-off-by: Xin Li Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221025073023.16137-2-xin3.li@intel.com --- Makefile | 2 +- arch/arm/boot/compressed/Makefile | 2 +- arch/arm64/kernel/vdso/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 2 +- arch/x86/boot/compressed/Makefile | 2 +- init/Kconfig | 15 ++++++++++++--- 6 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index f41ec8c8426ba..9a496bef3170c 100644 --- a/Makefile +++ b/Makefile @@ -1118,7 +1118,7 @@ endif # We never want expected sections to be placed heuristically by the # linker. All sections should be explicitly named in the linker script. ifdef CONFIG_LD_ORPHAN_WARN -LDFLAGS_vmlinux += --orphan-handling=warn +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif # Align the bit size of userspace programs with the kernel diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 41bcbb460fac4..53cadc3aaff11 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -123,7 +123,7 @@ LDFLAGS_vmlinux += --no-undefined LDFLAGS_vmlinux += -X # Report orphan sections ifdef CONFIG_LD_ORPHAN_WARN -LDFLAGS_vmlinux += --orphan-handling=warn +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif # Next argument is a linker script LDFLAGS_vmlinux += -T diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 619e2dc7ee14c..beaf9586338f5 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -27,7 +27,7 @@ ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ -Bsymbolic --build-id=sha1 -n $(btildflags-y) ifdef CONFIG_LD_ORPHAN_WARN - ldflags-y += --orphan-handling=warn + ldflags-y += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif ldflags-y += -T diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 36c8f66cad251..f59bd1a4ead6b 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -104,7 +104,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__ VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 -VDSO_LDFLAGS += --orphan-handling=warn +VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) # Borrow vdsomunge.c from the arm vDSO diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d158..66b8a8cb5a0f8 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -68,7 +68,7 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # address by the bootloader. LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) ifdef CONFIG_LD_ORPHAN_WARN -LDFLAGS_vmlinux += --orphan-handling=warn +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif LDFLAGS_vmlinux += -z noexecstack ifeq ($(CONFIG_LD_IS_BFD),y) diff --git a/init/Kconfig b/init/Kconfig index 694f7c160c9c1..bb1225ef04e76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -159,10 +159,12 @@ config WERROR help A kernel build should not cause any compiler warnings, and this enables the '-Werror' (for C) and '-Dwarnings' (for Rust) flags - to enforce that rule by default. + to enforce that rule by default. Certain warnings from other tools + such as the linker may be upgraded to errors with this option as + well. - However, if you have a new (or very old) compiler with odd and - unusual warnings, or you have some architecture with problems, + However, if you have a new (or very old) compiler or linker with odd + and unusual warnings, or you have some architecture with problems, you may need to disable this config option in order to successfully build the kernel. @@ -1454,6 +1456,13 @@ config LD_ORPHAN_WARN def_bool y depends on ARCH_WANT_LD_ORPHAN_WARN depends on $(ld-option,--orphan-handling=warn) + depends on $(ld-option,--orphan-handling=error) + +config LD_ORPHAN_WARN_LEVEL + string + depends on LD_ORPHAN_WARN + default "error" if WERROR + default "warn" config SYSCTL bool -- GitLab From cd536db050993f7c220a6cfb01de5356032b6f8e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:10:11 -0700 Subject: [PATCH 0308/1423] dma-buf: Proactively round up to kmalloc bucket size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. Cc: Sumit Semwal Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Reviewed-by: Christian König Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018090858.never.941-kees@kernel.org --- drivers/dma-buf/dma-resv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index e3885c90a3acb..1c76aed8e2626 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -98,12 +98,17 @@ static void dma_resv_list_set(struct dma_resv_list *list, static struct dma_resv_list *dma_resv_list_alloc(unsigned int max_fences) { struct dma_resv_list *list; + size_t size; - list = kmalloc(struct_size(list, table, max_fences), GFP_KERNEL); + /* Round up to the next kmalloc bucket size. */ + size = kmalloc_size_roundup(struct_size(list, table, max_fences)); + + list = kmalloc(size, GFP_KERNEL); if (!list) return NULL; - list->max_fences = (ksize(list) - offsetof(typeof(*list), table)) / + /* Given the resulting bucket size, recalculated max_fences. */ + list->max_fences = (size - offsetof(typeof(*list), table)) / sizeof(*list->table); return list; -- GitLab From 905889bc6c842d18f369bf2834cf7219f32709ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:13 -0700 Subject: [PATCH 0309/1423] btrfs: send: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Acked-by: David Sterba Link: https://lore.kernel.org/lkml/20220922133014.GI32411@suse.cz Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220923202822.2667581-8-keescook@chromium.org --- fs/btrfs/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ef4167072b89..f53e8049473dd 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -438,6 +438,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) path_len = p->end - p->start; old_buf_len = p->buf_len; + /* + * Allocate to the next largest kmalloc bucket size, to let + * the fast path happen most of the time. + */ + len = kmalloc_size_roundup(len); /* * First time the inline_buf does not suffice */ @@ -451,11 +456,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; - /* - * The real size of the buffer is bigger, this will let the fast path - * happen most of the time - */ - p->buf_len = ksize(p->buf); + p->buf_len = len; if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; -- GitLab From 6dd142d9013ca82155d0c069434c60a0d5755ec0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 20 Sep 2022 14:13:05 -0700 Subject: [PATCH 0310/1423] coredump: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook --- fs/coredump.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e67..97eaee3252515 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -68,7 +68,10 @@ struct core_name { static int expand_corename(struct core_name *cn, int size) { - char *corename = krealloc(cn->corename, size, GFP_KERNEL); + char *corename; + + size = kmalloc_size_roundup(size); + corename = krealloc(cn->corename, size, GFP_KERNEL); if (!corename) return -ENOMEM; @@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size) if (size > core_name_size) /* racy but harmless */ core_name_size = size; - cn->size = ksize(corename); + cn->size = size; cn->corename = corename; return 0; } -- GitLab From 79218fd0b38bb05e8dcb80a49342836274046432 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:00:45 -0700 Subject: [PATCH 0311/1423] iommu/amd: Drop unnecessary checks in amd_iommu_attach_device() The same checks are done in amd_iommu_probe_device(). If any of them fails there, then the device won't get a group, so there's no way for it to even reach amd_iommu_attach_device anymore. Link: https://lore.kernel.org/r/c054654a81f2b675c73108fe4bf10e45335a721a.1666042872.git.nicolinc@nvidia.com Suggested-by: Robin Murphy Cc: Joerg Roedel Cc: Suravee Suthikulpanit Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/amd/iommu.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d3b39d0416fa3..45299eb7e8e30 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2155,21 +2155,13 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; + struct amd_iommu *iommu = rlookup_amd_iommu(dev); int ret; - if (!check_device(dev)) - return -EINVAL; - - dev_data = dev_iommu_priv_get(dev); dev_data->defer_attach = false; - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return -EINVAL; - if (dev_data->domain) detach_device(dev); -- GitLab From 00208852d351ca6e4a8b9ff0c5376fa3a8ed8eaa Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:01:22 -0700 Subject: [PATCH 0312/1423] iommu: Add return value rules to attach_dev op and APIs Cases like VFIO wish to attach a device to an existing domain that was not allocated specifically from the device. This raises a condition where the IOMMU driver can fail the domain attach because the domain and device are incompatible with each other. This is a soft failure that can be resolved by using a different domain. Provide a dedicated errno EINVAL from the IOMMU driver during attach that the reason why the attach failed is because of domain incompatibility. VFIO can use this to know that the attach is a soft failure and it should continue searching. Otherwise, the attach will be a hard failure and VFIO will return the code to userspace. Update kdocs to add rules of return value to the attach_dev op and APIs. Link: https://lore.kernel.org/r/bd56d93c18621104a0fa1b0de31e9b760b81b769.1666042872.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 24 ++++++++++++++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 2 files changed, 36 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 65a3b3d886dc0..972731f0b328a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1949,6 +1949,18 @@ static int __iommu_attach_device(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_device - Attach an IOMMU domain to a device + * @domain: IOMMU domain to attach + * @dev: Device that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the device. In this case attaching a different domain to the + * device may succeed. + */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct iommu_group *group; @@ -2075,6 +2087,18 @@ static int __iommu_attach_group(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_group - Attach an IOMMU domain to an IOMMU group + * @domain: IOMMU domain to attach + * @group: IOMMU group that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the group. In this case attaching a different domain to the + * group may succeed. + */ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e3..857898d102b36 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -266,6 +266,18 @@ struct iommu_ops { /** * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device + * Return: + * * 0 - success + * * EINVAL - can indicate that device and domain are incompatible due to + * some previous configuration of the domain, in which case the + * driver shouldn't log an error, since it is legitimate for a + * caller to test reuse of existing domains. Otherwise, it may + * still represent some other fundamental problem + * * ENOMEM - out of memory + * * ENOSPC - non-ENOMEM type of resource allocation failures + * * EBUSY - device is attached to a domain and cannot be changed + * * ENODEV - device specific errors, not able to be attached + * * - treated as ENODEV by the caller. Use is discouraged * @detach_dev: detach an iommu domain from a device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to -- GitLab From bd7ebb7719356d750b1b4d671535922bae43fb3b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:02:13 -0700 Subject: [PATCH 0313/1423] iommu: Regulate EINVAL in ->attach_dev callback functions Following the new rules in include/linux/iommu.h kdocs, EINVAL now can be used to indicate that domain and device are incompatible by a caller that treats it as a soft failure and tries attaching to another domain. On the other hand, there are ->attach_dev callback functions returning it for obvious device-specific errors. They will result in some inefficiency in the caller handling routine. Update these places to corresponding errnos following the new rules. Link: https://lore.kernel.org/r/5924c03bea637f05feb2a20d624bae086b555ec5.1666042872.git.nicolinc@nvidia.com Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/fsl_pamu.c | 2 +- drivers/iommu/fsl_pamu_domain.c | 4 ++-- drivers/iommu/intel/pasid.c | 6 ++++-- drivers/iommu/mtk_iommu.c | 2 +- drivers/iommu/omap-iommu.c | 4 ++-- drivers/iommu/virtio-iommu.c | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c index 0d03f837a5d4e..2eb3211c81677 100644 --- a/drivers/iommu/fsl_pamu.c +++ b/drivers/iommu/fsl_pamu.c @@ -211,7 +211,7 @@ int pamu_config_ppaace(int liodn, u32 omi, u32 stashid, int prot) ppaace->op_encode.index_ot.omi = omi; } else if (~omi != 0) { pr_debug("bad operation mapping index: %d\n", omi); - return -EINVAL; + return -ENODEV; } /* configure stash id */ diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index fa20f4b03e12d..4408ac3c49b61 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -258,7 +258,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, liodn = of_get_property(dev->of_node, "fsl,liodn", &len); if (!liodn) { pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node); - return -EINVAL; + return -ENODEV; } spin_lock_irqsave(&dma_domain->domain_lock, flags); @@ -267,7 +267,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, if (liodn[i] >= PAACE_NUMBER_ENTRIES) { pr_debug("Invalid liodn %d, attach device failed for %pOF\n", liodn[i], dev->of_node); - ret = -EINVAL; + ret = -ENODEV; break; } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c30ddac40ee5f..95d73f19ab619 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -101,8 +101,10 @@ int intel_pasid_alloc_table(struct device *dev) might_sleep(); info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) - return -EINVAL; + if (WARN_ON(!info || !dev_is_pci(dev))) + return -ENODEV; + if (WARN_ON(info->pasid_table)) + return -EEXIST; pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 2ab2ecfe01f80..eda441d0c6b68 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -609,7 +609,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, dom->iop = alloc_io_pgtable_ops(ARM_V7S, &dom->cfg, data); if (!dom->iop) { dev_err(data->dev, "Failed to alloc io pgtable\n"); - return -EINVAL; + return -ENOMEM; } /* Update our support page sizes bitmap */ diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 07ee2600113c2..3f153f9e0ac59 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1414,7 +1414,7 @@ static int omap_iommu_attach_init(struct device *dev, odomain->num_iommus = omap_iommu_count(dev); if (!odomain->num_iommus) - return -EINVAL; + return -ENODEV; odomain->iommus = kcalloc(odomain->num_iommus, sizeof(*iommu), GFP_ATOMIC); @@ -1464,7 +1464,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!arch_data || !arch_data->iommu_dev) { dev_err(dev, "device doesn't have an associated iommu\n"); - return -EINVAL; + return -ENODEV; } spin_lock(&omap_domain->lock); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 8b1b5c270e502..0b64e7f64e68c 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -670,7 +670,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev, dev_err(vdev->dev, "granule 0x%lx larger than system page size 0x%lx\n", viommu_page_size, PAGE_SIZE); - return -EINVAL; + return -ENODEV; } ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, -- GitLab From f4a14773579302e5f0c4bf80b03f0db7ce67f2ce Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:02:21 -0700 Subject: [PATCH 0314/1423] iommu: Use EINVAL for incompatible device/domain in ->attach_dev Following the new rules in include/linux/iommu.h kdocs, update all drivers ->attach_dev callback functions to return EINVAL in the failure paths that are related to domain incompatibility. Also, drop adjacent error prints to prevent a kernel log spam. Link: https://lore.kernel.org/r/f52a07f7320da94afe575c9631340d0019a203a7.1666042873.git.nicolinc@nvidia.com Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 +---------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 7 +------ drivers/iommu/intel/iommu.c | 10 +++------- drivers/iommu/ipmmu-vmsa.c | 2 -- drivers/iommu/omap-iommu.c | 2 +- drivers/iommu/sprd-iommu.c | 4 +--- drivers/iommu/tegra-gart.c | 2 +- drivers/iommu/virtio-iommu.c | 3 +-- 9 files changed, 9 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6d5df91c5c465..8b0a1e476d44d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2430,23 +2430,14 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) goto out_unlock; } } else if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s (upstream of %s)\n", - dev_name(smmu_domain->smmu->dev), - dev_name(smmu->dev)); - ret = -ENXIO; + ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) { - dev_err(dev, - "cannot attach to incompatible domain (%u SSID bits != %u)\n", - smmu_domain->s1_cfg.s1cdmax, master->ssid_bits); ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && smmu_domain->stall_enabled != master->stall_enabled) { - dev_err(dev, "cannot attach to stall-%s domain\n", - smmu_domain->stall_enabled ? "enabled" : "disabled"); ret = -EINVAL; goto out_unlock; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 30dab1418e3ff..719fbca1fe52a 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1150,9 +1150,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) * different SMMUs. */ if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n", - dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev)); ret = -EINVAL; goto rpm_put; } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 3869c3ecda8cd..bfd7b51eb5dbf 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -381,13 +381,8 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev * Sanity check the domain. We don't support domains across * different IOMMUs. */ - if (qcom_domain->iommu != qcom_iommu) { - dev_err(dev, "cannot attach to IOMMU %s while already " - "attached to domain on IOMMU %s\n", - dev_name(qcom_domain->iommu->dev), - dev_name(qcom_iommu->dev)); + if (qcom_domain->iommu != qcom_iommu) return -EINVAL; - } return 0; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 48cdcd0a5cf34..6f1a59206d2e0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4194,19 +4194,15 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, return -ENODEV; if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EOPNOTSUPP; + return -EINVAL; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) { - dev_err(dev, "%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, addr_width, dmar_domain->max_addr); - return -EFAULT; - } + if (dmar_domain->max_addr > (1LL << addr_width)) + return -EINVAL; dmar_domain->gaw = addr_width; /* diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 3b30c0752274f..22230cc15dcd1 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -628,8 +628,6 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, * Something is wrong, we can't attach two devices using * different IOMMUs to the same domain. */ - dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n", - dev_name(mmu->dev), dev_name(domain->mmu->dev)); ret = -EINVAL; } else dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id); diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 3f153f9e0ac59..2fd7702c67096 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1472,7 +1472,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) /* only a single client device can be attached to a domain */ if (omap_domain->dev) { dev_err(dev, "iommu domain is already attached\n"); - ret = -EBUSY; + ret = -EINVAL; goto out; } diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index fadd2c907222b..e027933755989 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -237,10 +237,8 @@ static int sprd_iommu_attach_device(struct iommu_domain *domain, struct sprd_iommu_domain *dom = to_sprd_domain(domain); size_t pgt_size = sprd_iommu_pgt_size(domain); - if (dom->sdev) { - pr_err("There's already a device attached to this domain.\n"); + if (dom->sdev) return -EINVAL; - } dom->pgt_va = dma_alloc_coherent(sdev->dev, pgt_size, &dom->pgt_pa, GFP_KERNEL); if (!dom->pgt_va) diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index e5ca3cf1a9496..ed53279d1106c 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -112,7 +112,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain, spin_lock(&gart->dom_lock); if (gart->active_domain && gart->active_domain != domain) { - ret = -EBUSY; + ret = -EINVAL; } else if (dev_iommu_priv_get(dev) != domain) { dev_iommu_priv_set(dev, domain); gart->active_domain = domain; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 0b64e7f64e68c..8226b4da43507 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -734,8 +734,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) */ ret = viommu_domain_finalise(vdev, domain); } else if (vdomain->viommu != vdev->viommu) { - dev_err(dev, "cannot attach to foreign vIOMMU\n"); - ret = -EXDEV; + ret = -EINVAL; } mutex_unlock(&vdomain->mutex); -- GitLab From 04cee82e04d2aff3d177ef0021ecdff228daf7b8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:02:36 -0700 Subject: [PATCH 0315/1423] iommu: Propagate return value in ->attach_dev callback functions The mtk_iommu and virtio drivers have places in the ->attach_dev callback functions that return hardcode errnos instead of the returned values, but callers of these ->attach_dv callback functions may care. Propagate them directly without the extra conversions. Link: https://lore.kernel.org/r/ca8c5a447b87002334f83325f28823008b4ce420.1666042873.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jean-Philippe Brucker Reviewed-by: Jason Gunthorpe Reviewed-by: Yong Wu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/mtk_iommu.c | 2 +- drivers/iommu/virtio-iommu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index eda441d0c6b68..b383c8327f9cb 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -668,7 +668,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, ret = mtk_iommu_domain_finalise(dom, frstdata, region_id); if (ret) { mutex_unlock(&dom->mutex); - return -ENODEV; + return ret; } dom->bank = &data->bank[bankid]; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 8226b4da43507..5b8fe9bfa9a5b 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -697,7 +697,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev, if (ret) { ida_free(&viommu->domain_ids, vdomain->id); vdomain->viommu = NULL; - return -EOPNOTSUPP; + return ret; } } -- GitLab From 91586ce0d39a05f88795aa8814fb99b1387236b3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 21 Oct 2022 10:34:22 +0800 Subject: [PATCH 0316/1423] f2fs: fix to invalidate dcc->f2fs_issue_discard in error path Syzbot reports a NULL pointer dereference issue as below: __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] get_task_struct include/linux/sched/task.h:110 [inline] kthread_stop+0x34/0x1c0 kernel/kthread.c:703 f2fs_stop_discard_thread+0x3c/0x5c fs/f2fs/segment.c:1638 kill_f2fs_super+0x5c/0x194 fs/f2fs/super.c:4522 deactivate_locked_super+0x70/0xe8 fs/super.c:332 deactivate_super+0xd0/0xd4 fs/super.c:363 cleanup_mnt+0x1f8/0x234 fs/namespace.c:1186 __cleanup_mnt+0x20/0x30 fs/namespace.c:1193 task_work_run+0xc4/0x14c kernel/task_work.c:177 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x26c/0xbe0 kernel/exit.c:795 do_group_exit+0x60/0xe8 kernel/exit.c:925 __do_sys_exit_group kernel/exit.c:936 [inline] __se_sys_exit_group kernel/exit.c:934 [inline] __wake_up_parent+0x0/0x40 kernel/exit.c:934 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 The root cause of this issue is in error path of f2fs_start_discard_thread(), it missed to invalidate dcc->f2fs_issue_discard, later kthread_stop() may access invalid pointer. Fixes: 4d67490498ac ("f2fs: Don't create discard thread when device doesn't support realtime discard") Reported-by: syzbot+035a381ea1afb63f098d@syzkaller.appspotmail.com Reported-by: syzbot+729c925c2d9fc495ddee@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa43635..7a4f7c88b8b9f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2025,8 +2025,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } -- GitLab From 18792e64c86dd7e34ba28e4f61faba472b7bf5fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Oct 2022 23:09:28 +0800 Subject: [PATCH 0317/1423] f2fs: support fault injection for f2fs_is_valid_blkaddr() This patch supports to inject fault into f2fs_is_valid_blkaddr() to simulate accessing inconsistent data/meta block addressses from caller. Usage: a) echo 262144 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=262144 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 4 files changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 17df9a02ccff1..b797e8ec96ede 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -199,6 +199,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_SLAB_ALLOC 0x000008000 FAULT_DQUOT_INIT 0x000010000 FAULT_LOCK_OP 0x000020000 + FAULT_BLKADDR 0x000040000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0c82dae082aa9..c00694a502227 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e6355a5683b75..f57cb49dc3830 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3834ead046200..df26fbe2bf586 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, -- GitLab From 3688cbe39b7a9ef3feb73234fb351de33fd1da52 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:08:31 +0800 Subject: [PATCH 0318/1423] f2fs: remove batched_trim_sections node commit 377224c47118("f2fs: don't split checkpoint in fstrim") obsolete batch mode and related sysfs entry. Since this testing sysfs node has been deprecated for a long time, let's remove it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/sysfs.c | 5 ----- 2 files changed, 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f57cb49dc3830..e990870bdab92 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1063,9 +1063,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index df27afd71ef48..926b7a844362e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -488,9 +488,6 @@ out: return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -790,7 +787,6 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -919,7 +915,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- GitLab From 6359a1aaca527311b7145ec6eb16890a5ddf5214 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:24 +0800 Subject: [PATCH 0319/1423] f2fs: fix gc mode when gc_urgent_high_remaining is 1 Under the current logic, when gc_urgent_high_remaining is set to 1, the mode will be switched to normal at the beginning, instead of running in gc_urgent mode. Let's switch the gc mode back to normal when the gc ends. Fixes: 265576181b4a ("f2fs: remove gc_urgent_high_limited for cleanup") Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7b4be412cec06..d2e9c280773f0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,16 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -162,6 +152,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); -- GitLab From 44b9d01f2ee32884a7de270394b0fb7f75f87dba Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:05:26 +0800 Subject: [PATCH 0320/1423] f2fs: cleanup in f2fs_create_flush_cmd_control() Just cleanup for readable, no functional changes. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a4f7c88b8b9f..0df47ad80efba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,12 +620,12 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; + int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -638,7 +638,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, @@ -650,7 +650,7 @@ init_thread: return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) -- GitLab From b5f1a218ae5e4339130d6e733f0e63d623e09a2c Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Tue, 25 Oct 2022 17:40:36 +0800 Subject: [PATCH 0321/1423] f2fs: fix normal discard process In the DPOLICY_BG mode, there is a conflict between the two conditions "i + 1 < dpolicy->granularity" and "i < DEFAULT_DISCARD_GRANULARITY". If i = 15, the first condition is false, it will enter the second condition and dispatch all small granularity discards in function __issue_discard_cmd_orderly. The restrictive effect of the first condition to small discards will be invalidated. These two conditions should align. Fixes: 20ee4382322c ("f2fs: issue small discard by LBA order") Signed-off-by: Dongdong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0df47ad80efba..38f6a2bcb158b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; -- GitLab From 6047de5482c33d5f912cdc907336fde9ebc5714e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 01:54:01 +0800 Subject: [PATCH 0322/1423] f2fs: add barrier mount option This patch adds a mount option, barrier, in f2fs. The barrier option is the opposite of nobarrier. If this option is set, cache_flush commands are allowed to be issued. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 ++ fs/f2fs/super.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index b797e8ec96ede..6e67c5e6c7c37 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -154,6 +154,8 @@ nobarrier This option can be used if underlying storage guarantees If this option is set, no cache_flush commands are issued but f2fs still guarantees the write ordering of all the data writes. +barrier If this option is set, cache_flush commands are allowed to be + issued. fastboot This option is used when a system wants to reduce mount time as much as possible, even though normal performance can be sacrificed. diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df26fbe2bf586..a247027711d89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -111,6 +111,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -187,6 +188,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -807,6 +809,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; @@ -1940,6 +1945,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) -- GitLab From a995627e6dd81d4485d40ce64880017a080d71e6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Oct 2022 16:00:35 -0700 Subject: [PATCH 0323/1423] f2fs: allow to set compression for inlined file The below commit disallows to set compression on empty created file which has a inline_data. Let's fix it. Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda12582272..f96bbfa8b3991 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1915,6 +1915,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) -- GitLab From c46867e9b9b8e0cdd6a5212c2b5ae616583a3bfd Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:32:26 +0800 Subject: [PATCH 0324/1423] f2fs: introduce max_ordered_discard sysfs node The current max_ordered_discard is a fixed value, change it to be configurable through the sys node. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 11 +++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 483639fb727b2..53f70eadec96a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -99,6 +99,12 @@ Description: Controls the issue rate of discard commands that consist of small checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. +What: /sys/fs/f2fs//max_ordered_discard +Date: October 2022 +Contact: "Yangtao Li" +Description: Controls the maximum ordered discard, the unit size is one block(4KB). + Set it to 16 by default. + What: /sys/fs/f2fs//max_discard_request Date: December 2021 Contact: "Konstantin Vyshetsky" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e990870bdab92..fa8dc00dfb2b8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -331,6 +331,8 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -410,6 +412,7 @@ struct discard_cmd_control { unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 38f6a2bcb158b..c470b443615f1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -2048,6 +2048,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 926b7a844362e..8095345ebdad5 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -483,6 +483,15 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -786,6 +795,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -914,6 +924,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), -- GitLab From a5029a57a2f3f2e2711f4ac2d876c3c83d1758fe Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Thu, 27 Oct 2022 20:01:05 +0900 Subject: [PATCH 0325/1423] f2fs: Fix typo in comments Change "truncateion" to "truncation". Signed-off-by: Keoseong Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f96bbfa8b3991..c605a4f2bce21 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); -- GitLab From 0db18eec0d9a7ee525209e31e3ac2f673545b12f Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 27 Oct 2022 14:42:40 +0530 Subject: [PATCH 0326/1423] f2fs: fix the assign logic of iocb commit 18ae8d12991b ("f2fs: show more DIO information in tracepoint") introduces iocb field in 'f2fs_direct_IO_enter' trace event And it only assigns the pointer and later it accesses its field in trace print log. Unable to handle kernel paging request at virtual address ffffffc04cef3d30 Mem abort info: ESR = 0x96000007 EC = 0x25: DABT (current EL), IL = 32 bits pc : trace_raw_output_f2fs_direct_IO_enter+0x54/0xa4 lr : trace_raw_output_f2fs_direct_IO_enter+0x2c/0xa4 sp : ffffffc0443cbbd0 x29: ffffffc0443cbbf0 x28: ffffff8935b120d0 x27: ffffff8935b12108 x26: ffffff8935b120f0 x25: ffffff8935b12100 x24: ffffff8935b110c0 x23: ffffff8935b10000 x22: ffffff88859a936c x21: ffffff88859a936c x20: ffffff8935b110c0 x19: ffffff8935b10000 x18: ffffffc03b195060 x17: ffffff8935b11e76 x16: 00000000000000cc x15: ffffffef855c4f2c x14: 0000000000000001 x13: 000000000000004e x12: ffff0000ffffff00 x11: ffffffef86c350d0 x10: 00000000000010c0 x9 : 000000000fe0002c x8 : ffffffc04cef3d28 x7 : 7f7f7f7f7f7f7f7f x6 : 0000000002000000 x5 : ffffff8935b11e9a x4 : 0000000000006250 x3 : ffff0a00ffffff04 x2 : 0000000000000002 x1 : ffffffef86a0a31f x0 : ffffff8935b10000 Call trace: trace_raw_output_f2fs_direct_IO_enter+0x54/0xa4 print_trace_fmt+0x9c/0x138 print_trace_line+0x154/0x254 tracing_read_pipe+0x21c/0x380 vfs_read+0x108/0x3ac ksys_read+0x7c/0xec __arm64_sys_read+0x20/0x30 invoke_syscall+0x60/0x150 el0_svc_common.llvm.1237943816091755067+0xb8/0xf8 do_el0_svc+0x28/0xa0 Fix it by copying the required variables for printing and while at it fix the similar issue at some other places in the same file. Fixes: bd984c03097b ("f2fs: show more DIO information in tracepoint") Signed-off-by: Mukesh Ojha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c6b372401c278..ff57e7f9914cc 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -322,7 +322,7 @@ TRACE_EVENT(f2fs_unlink_enter, __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) - __field(const char *, name) + __string(name, dentry->d_name.name) ), TP_fast_assign( @@ -330,7 +330,7 @@ TRACE_EVENT(f2fs_unlink_enter, __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " @@ -338,7 +338,7 @@ TRACE_EVENT(f2fs_unlink_enter, show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, - __entry->name) + __get_str(name)) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, @@ -940,25 +940,29 @@ TRACE_EVENT(f2fs_direct_IO_enter, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(struct kiocb *, iocb) + __field(loff_t, ki_pos) + __field(int, ki_flags) + __field(u16, ki_ioprio) __field(unsigned long, len) __field(int, rw) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->iocb = iocb; - __entry->len = len; - __entry->rw = rw; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ki_pos = iocb->ki_pos; + __entry->ki_flags = iocb->ki_flags; + __entry->ki_ioprio = iocb->ki_ioprio; + __entry->len = len; + __entry->rw = rw; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), - __entry->iocb->ki_pos, + __entry->ki_pos, __entry->len, - __entry->iocb->ki_flags, - __entry->iocb->ki_ioprio, + __entry->ki_flags, + __entry->ki_ioprio, __entry->rw) ); @@ -1407,19 +1411,19 @@ TRACE_EVENT(f2fs_write_checkpoint, TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) - __field(char *, msg) + __string(dest_msg, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - __entry->msg = msg; + __assign_str(dest_msg, msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), - __entry->msg) + __get_str(dest_msg)) ); DECLARE_EVENT_CLASS(f2fs_discard, -- GitLab From 195623f2d8e9361eaddec071ad298998ec0590ba Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 27 Oct 2022 14:42:41 +0530 Subject: [PATCH 0327/1423] f2fs: fix the msg data type Data type of msg in f2fs_write_checkpoint trace should be const char * instead of char *. Signed-off-by: Mukesh Ojha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index ff57e7f9914cc..7fbfce498472f 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1404,7 +1404,7 @@ TRACE_EVENT(f2fs_readpages, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, int reason, char *msg), + TP_PROTO(struct super_block *sb, int reason, const char *msg), TP_ARGS(sb, reason, msg), -- GitLab From 146dbcbf17a6d07169e75224d949cc2670de2e20 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 27 Oct 2022 18:24:46 +0800 Subject: [PATCH 0328/1423] f2fs: fix return val in f2fs_start_ckpt_thread() Return PTR_ERR(cprc->f2fs_issue_ckpt) instead of -ENOMEM; Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/gc.c | 15 +++++++-------- fs/f2fs/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c00694a502227..56f7d0d6a8b2f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1902,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d2e9c280773f0..15f56859966cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -171,13 +171,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -192,12 +189,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c470b443615f1..c4270cd6eaab7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,7 +620,6 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; @@ -644,7 +643,8 @@ init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; -- GitLab From 7b02b2201893a71b881026cf574902019ab00db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Oct 2022 17:30:26 +0800 Subject: [PATCH 0329/1423] f2fs: fix to destroy sbi->post_read_wq in error path of f2fs_fill_super() In error path of f2fs_fill_super(), this patch fixes to call f2fs_destroy_post_read_wq() once if we fail in f2fs_start_ckpt_thread(). Fixes: 261eeb9c1585 ("f2fs: introduce checkpoint_merge mount option") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a247027711d89..e6365f040171e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4531,9 +4531,9 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); -- GitLab From a3951cd199a5d26138532d4e55af41262237632e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:32:16 +0800 Subject: [PATCH 0330/1423] f2fs: introduce gc_mode sysfs node Revert "f2fs: make gc_urgent and gc_segment_mode sysfs node readable". Add a gc_mode sysfs node to show the current gc_mode as a string. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 53f70eadec96a..ef2b3572ba181 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -640,3 +640,9 @@ Date: July 2022 Contact: "Daeho Jeong" Description: Show the accumulated total revoked atomic write block count after boot. If you write "0" here, you can initialize to "0". + +What: /sys/fs/f2fs//gc_mode +Date: October 2022 +Contact: "Yangtao Li" +Description: Show the current gc_mode as a string. + This is a read-only entry. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa8dc00dfb2b8..662b27c19de10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,6 +1319,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8095345ebdad5..1fbd41c483282 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,12 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif - if (!strcmp(a->attr.name, "gc_urgent")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -844,6 +845,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -926,6 +928,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- GitLab From 23ddc81b087c8bd9a73272afa076e204dc2e5410 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 09:49:53 -0700 Subject: [PATCH 0331/1423] f2fs: use sysfs_emit instead of sprintf Let's use sysfs_emit. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1fbd41c483282..af23ed6121b00 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); } static ssize_t cp_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, @@ -139,7 +139,7 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, { if (!SM_I(sbi)->dcc_info) return -EINVAL; - return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } @@ -205,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -217,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -247,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } @@ -257,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -268,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif @@ -363,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -728,7 +728,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -741,8 +741,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ -- GitLab From eebd36a408bb6fc5d7adbb4b8c6db993d0a850f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 10:07:13 -0700 Subject: [PATCH 0332/1423] f2fs: add missing bracket in doc Let's add missing <>. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ef2b3572ba181..a6a60268dcc5a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -241,7 +241,7 @@ Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: /feature_list/ +Description: /feature_list/> Shows all enabled features in current device. Supported features: encryption, blkzoned, extra_attr, projquota, inode_checksum, -- GitLab From e5a0db6a9e2eafe50e3ebc73a8285ae561e7d850 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:25 +0800 Subject: [PATCH 0333/1423] f2fs: replace gc_urgent_high_remaining with gc_remaining_trials The user can set the trial count limit for GC urgent and idle mode with replaced gc_remaining_trials.. If GC thread gets to the limit, the mode will turn back to GC normal mode finally. It was applied only to GC_URGENT, while this patch expands it for GC_IDLE. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++---- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 12 ++++++------ 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a6a60268dcc5a..24e7cb77f265b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -598,10 +598,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo in the length of 1.. by turns. This value can be set between 1..512 and the default value is 4. -What: /sys/fs/f2fs//gc_urgent_high_remaining -Date: December 2021 -Contact: "Daeho Jeong" -Description: You can set the trial count limit for GC urgent high mode with this value. +What: /sys/fs/f2fs//gc_remaining_trials +Date: October 2022 +Contact: "Yangtao Li" +Description: You can set the trial count limit for GC urgent and idle mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 662b27c19de10..04ef4cce3d7f4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1736,8 +1736,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 15f56859966cc..6466db75af5d2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,14 +152,14 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) sbi->gc_mode = GC_NORMAL; } - spin_unlock(&sbi->gc_urgent_high_lock); + spin_unlock(&sbi->gc_remaining_trials_lock); } sb_end_write(sbi->sb); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6365f040171e..a43d8a46a6e56 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3624,7 +3624,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index af23ed6121b00..032c03e09580d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -538,10 +538,10 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_remaining = t; - spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -832,7 +832,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -961,7 +961,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), -- GitLab From 3b21b794b5797d35f4fad930b53b1cd881c12dd3 Mon Sep 17 00:00:00 2001 From: "wangkailong@jari.cn" Date: Sat, 29 Oct 2022 22:49:30 +0800 Subject: [PATCH 0334/1423] f2fs: replace ternary operator with max() Fix the following coccicheck warning: ./fs/f2fs/segment.c:877:24-25: WARNING opportunity for max() Signed-off-by: KaiLong Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4270cd6eaab7..aa4be7f25963d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,7 +856,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; -- GitLab From f6c64dc32ab91b4c37fa2a255d2270f4ff0b95ba Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 29 Oct 2022 09:25:05 +0800 Subject: [PATCH 0335/1423] apparmor: Add __init annotation to aa_{setup/teardown}_dfa_engine() The aa_setup_dfa_engine() and aa_teardown_dfa_engine() is only called in apparmor_init(), so let us add __init annotation to them. Fixes: 11c236b89d7c ("apparmor: add a default null dfa") Signed-off-by: Xiu Jianfeng Signed-off-by: John Johansen --- security/apparmor/match.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/match.c b/security/apparmor/match.c index 5095c26ca6836..b97ef5e1db732 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -31,7 +31,7 @@ static char stacksplitdfa_src[] = { }; struct aa_dfa *stacksplitdfa; -int aa_setup_dfa_engine(void) +int __init aa_setup_dfa_engine(void) { int error; @@ -59,7 +59,7 @@ int aa_setup_dfa_engine(void) return 0; } -void aa_teardown_dfa_engine(void) +void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); -- GitLab From 4295c60bbe9e63e35d330546eeaa1d2b62dae303 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 1 Nov 2022 05:40:40 -0700 Subject: [PATCH 0336/1423] apparmor: Fix uninitialized symbol 'array_size' in policy_unpack_test.c Make sure array_size is initialized in the kunit test to get rid of compiler warnings. This will also make sure the following tests fail consistently if the first test fails. Reported-by: kernel test robot Signed-off-by: John Johansen --- security/apparmor/policy_unpack_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index b214f6ea8a725..7465da42492d1 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -140,7 +140,7 @@ static void policy_unpack_test_inbounds_when_out_of_bounds(struct kunit *test) static void policy_unpack_test_unpack_array_with_null_name(struct kunit *test) { struct policy_unpack_fixture *puf = test->priv; - u16 array_size; + u16 array_size = 0; puf->e->pos += TEST_ARRAY_BUF_OFFSET; @@ -155,7 +155,7 @@ static void policy_unpack_test_unpack_array_with_name(struct kunit *test) { struct policy_unpack_fixture *puf = test->priv; const char name[] = TEST_ARRAY_NAME; - u16 array_size; + u16 array_size = 0; puf->e->pos += TEST_NAMED_ARRAY_BUF_OFFSET; -- GitLab From 4b21d25bf519c9487935a664886956bb18f04f6d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Oct 2022 23:11:25 +0300 Subject: [PATCH 0337/1423] overflow: Introduce overflows_type() and castable_to_type() Implement a robust overflows_type() macro to test if a variable or constant value would overflow another variable or type. This can be used as a constant expression for static_assert() (which requires a constant expression[1][2]) when used on constant values. This must be constructed manually, since __builtin_add_overflow() does not produce a constant expression[3]. Additionally adds castable_to_type(), similar to __same_type(), but for checking if a constant value would overflow if cast to a given type. Add unit tests for overflows_type(), __same_type(), and castable_to_type() to the existing KUnit "overflow" test: [16:03:33] ================== overflow (21 subtests) ================== ... [16:03:33] [PASSED] overflows_type_test [16:03:33] [PASSED] same_type_test [16:03:33] [PASSED] castable_to_type_test [16:03:33] ==================== [PASSED] overflow ===================== [16:03:33] ============================================================ [16:03:33] Testing complete. Ran 21 tests: passed: 21 [16:03:33] Elapsed time: 24.022s total, 0.002s configuring, 22.598s building, 0.767s running [1] https://en.cppreference.com/w/c/language/_Static_assert [2] C11 standard (ISO/IEC 9899:2011): 6.7.10 Static assertions [3] https://gcc.gnu.org/onlinedocs/gcc/Integer-Overflow-Builtins.html 6.56 Built-in Functions to Perform Arithmetic with Overflow Checking Built-in Function: bool __builtin_add_overflow (type1 a, type2 b, Cc: Luc Van Oostenryck Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Daniel Latypov Cc: Vitor Massaru Iha Cc: "Gustavo A. R. Silva" Cc: Jani Nikula Cc: Mauro Carvalho Chehab Cc: linux-hardening@vger.kernel.org Cc: llvm@lists.linux.dev Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221024201125.1416422-1-gwan-gyeong.mun@intel.com --- drivers/gpu/drm/i915/i915_user_extensions.c | 2 +- drivers/gpu/drm/i915/i915_utils.h | 4 - include/linux/compiler.h | 1 + include/linux/overflow.h | 47 +++ lib/Makefile | 1 + lib/overflow_kunit.c | 381 ++++++++++++++++++++ 6 files changed, 431 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_user_extensions.c b/drivers/gpu/drm/i915/i915_user_extensions.c index c822d0aafd2dc..e3f808372c47b 100644 --- a/drivers/gpu/drm/i915/i915_user_extensions.c +++ b/drivers/gpu/drm/i915/i915_user_extensions.c @@ -51,7 +51,7 @@ int i915_user_extensions(struct i915_user_extension __user *ext, return err; if (get_user(next, &ext->next_extension) || - overflows_type(next, ext)) + overflows_type(next, uintptr_t)) return -EFAULT; ext = u64_to_user_ptr(next); diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 6c14d13364bf7..67a66d4d5c70c 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -111,10 +111,6 @@ bool i915_error_injected(void); #define range_overflows_end_t(type, start, size, max) \ range_overflows_end((type)(start), (type)(size), (type)(max)) -/* Note we don't consider signbits :| */ -#define overflows_type(x, T) \ - (sizeof(x) > sizeof(T) && (x) >> BITS_PER_TYPE(T)) - #define ptr_mask_bits(ptr, n) ({ \ unsigned long __v = (unsigned long)(ptr); \ (typeof(ptr))(__v & -BIT(n)); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 973a1bfd7ef53..947a60b801dbd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -236,6 +236,7 @@ static inline void *offset_to_ptr(const int *off) * bool and also pointer types. */ #define is_signed_type(type) (((type)(-1)) < (__force type)1) +#define is_unsigned_type(type) (!is_signed_type(type)) /* * This is needed in functions which generate the stack canary, see diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 1d3be1a2204c3..0e33b5cbdb9f6 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -128,6 +128,53 @@ static inline bool __must_check __must_check_overflow(bool overflow) (*_d >> _to_shift) != _a); \ })) +#define __overflows_type_constexpr(x, T) ( \ + is_unsigned_type(typeof(x)) ? \ + (x) > type_max(typeof(T)) : \ + is_unsigned_type(typeof(T)) ? \ + (x) < 0 || (x) > type_max(typeof(T)) : \ + (x) < type_min(typeof(T)) || (x) > type_max(typeof(T))) + +#define __overflows_type(x, T) ({ \ + typeof(T) v = 0; \ + check_add_overflow((x), v, &v); \ +}) + +/** + * overflows_type - helper for checking the overflows between value, variables, + * or data type + * + * @n: source constant value or variable to be checked + * @T: destination variable or data type proposed to store @x + * + * Compares the @x expression for whether or not it can safely fit in + * the storage of the type in @T. @x and @T can have different types. + * If @x is a constant expression, this will also resolve to a constant + * expression. + * + * Returns: true if overflow can occur, false otherwise. + */ +#define overflows_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + __overflows_type_constexpr(n, T), \ + __overflows_type(n, T)) + +/** + * castable_to_type - like __same_type(), but also allows for casted literals + * + * @n: variable or constant value + * @T: variable or data type + * + * Unlike the __same_type() macro, this allows a constant value as the + * first argument. If this value would not overflow into an assignment + * of the second argument's type, it returns true. Otherwise, this falls + * back to __same_type(). + */ +#define castable_to_type(n, T) \ + __builtin_choose_expr(__is_constexpr(n), \ + !__overflows_type_constexpr(n, T), \ + __same_type(n, T)) + /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor diff --git a/lib/Makefile b/lib/Makefile index 77c7951c8cf09..322178b9f7fbc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -374,6 +374,7 @@ obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o +CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index b8556a2e7bb1d..dcd3ba102db69 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -736,6 +736,384 @@ static void overflow_size_helpers_test(struct kunit *test) #undef check_one_size_helper } +static void overflows_type_test(struct kunit *test) +{ + int count = 0; + unsigned int var; + +#define __TEST_OVERFLOWS_TYPE(func, arg1, arg2, of) do { \ + bool __of = func(arg1, arg2); \ + KUNIT_EXPECT_EQ_MSG(test, __of, of, \ + "expected " #func "(" #arg1 ", " #arg2 " to%s overflow\n",\ + of ? "" : " not"); \ + count++; \ +} while (0) + +/* Args are: first type, second type, value, overflow expected */ +#define TEST_OVERFLOWS_TYPE(__t1, __t2, v, of) do { \ + __t1 t1 = (v); \ + __t2 t2; \ + __TEST_OVERFLOWS_TYPE(__overflows_type, t1, t2, of); \ + __TEST_OVERFLOWS_TYPE(__overflows_type, t1, __t2, of); \ + __TEST_OVERFLOWS_TYPE(__overflows_type_constexpr, t1, t2, of); \ + __TEST_OVERFLOWS_TYPE(__overflows_type_constexpr, t1, __t2, of);\ +} while (0) + + TEST_OVERFLOWS_TYPE(u8, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(u8, u16, U8_MAX, false); + TEST_OVERFLOWS_TYPE(u8, s8, U8_MAX, true); + TEST_OVERFLOWS_TYPE(u8, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(u8, s8, (u8)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u8, s16, U8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, u8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, u8, -1, true); + TEST_OVERFLOWS_TYPE(s8, u8, S8_MIN, true); + TEST_OVERFLOWS_TYPE(s8, u16, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, u16, -1, true); + TEST_OVERFLOWS_TYPE(s8, u16, S8_MIN, true); + TEST_OVERFLOWS_TYPE(s8, u32, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, u32, -1, true); + TEST_OVERFLOWS_TYPE(s8, u32, S8_MIN, true); +#if BITS_PER_LONG == 64 + TEST_OVERFLOWS_TYPE(s8, u64, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, u64, -1, true); + TEST_OVERFLOWS_TYPE(s8, u64, S8_MIN, true); +#endif + TEST_OVERFLOWS_TYPE(s8, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, s8, S8_MIN, false); + TEST_OVERFLOWS_TYPE(s8, s16, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s8, s16, S8_MIN, false); + TEST_OVERFLOWS_TYPE(u16, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(u16, u8, (u16)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u16, u8, U16_MAX, true); + TEST_OVERFLOWS_TYPE(u16, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(u16, s8, (u16)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u16, s8, U16_MAX, true); + TEST_OVERFLOWS_TYPE(u16, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(u16, s16, (u16)S16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u16, s16, U16_MAX, true); + TEST_OVERFLOWS_TYPE(u16, u32, U16_MAX, false); + TEST_OVERFLOWS_TYPE(u16, s32, U16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(s16, u8, (s16)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s16, u8, -1, true); + TEST_OVERFLOWS_TYPE(s16, u8, S16_MIN, true); + TEST_OVERFLOWS_TYPE(s16, u16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, u16, -1, true); + TEST_OVERFLOWS_TYPE(s16, u16, S16_MIN, true); + TEST_OVERFLOWS_TYPE(s16, u32, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, u32, -1, true); + TEST_OVERFLOWS_TYPE(s16, u32, S16_MIN, true); +#if BITS_PER_LONG == 64 + TEST_OVERFLOWS_TYPE(s16, u64, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, u64, -1, true); + TEST_OVERFLOWS_TYPE(s16, u64, S16_MIN, true); +#endif + TEST_OVERFLOWS_TYPE(s16, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s16, s8, S8_MIN, false); + TEST_OVERFLOWS_TYPE(s16, s8, (s16)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s16, s8, (s16)S8_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s16, s8, S16_MAX, true); + TEST_OVERFLOWS_TYPE(s16, s8, S16_MIN, true); + TEST_OVERFLOWS_TYPE(s16, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, s16, S16_MIN, false); + TEST_OVERFLOWS_TYPE(s16, s32, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s16, s32, S16_MIN, false); + TEST_OVERFLOWS_TYPE(u32, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(u32, u8, (u32)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u32, u8, U32_MAX, true); + TEST_OVERFLOWS_TYPE(u32, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(u32, s8, (u32)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u32, s8, U32_MAX, true); + TEST_OVERFLOWS_TYPE(u32, u16, U16_MAX, false); + TEST_OVERFLOWS_TYPE(u32, u16, U16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u32, u16, U32_MAX, true); + TEST_OVERFLOWS_TYPE(u32, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(u32, s16, (u32)S16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u32, s16, U32_MAX, true); + TEST_OVERFLOWS_TYPE(u32, u32, U32_MAX, false); + TEST_OVERFLOWS_TYPE(u32, s32, S32_MAX, false); + TEST_OVERFLOWS_TYPE(u32, s32, U32_MAX, true); + TEST_OVERFLOWS_TYPE(u32, s32, (u32)S32_MAX + 1, true); +#if BITS_PER_LONG == 64 + TEST_OVERFLOWS_TYPE(u32, u64, U32_MAX, false); + TEST_OVERFLOWS_TYPE(u32, s64, U32_MAX, false); +#endif + TEST_OVERFLOWS_TYPE(s32, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(s32, u8, (s32)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s32, u16, S32_MAX, true); + TEST_OVERFLOWS_TYPE(s32, u8, -1, true); + TEST_OVERFLOWS_TYPE(s32, u8, S32_MIN, true); + TEST_OVERFLOWS_TYPE(s32, u16, U16_MAX, false); + TEST_OVERFLOWS_TYPE(s32, u16, (s32)U16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s32, u16, S32_MAX, true); + TEST_OVERFLOWS_TYPE(s32, u16, -1, true); + TEST_OVERFLOWS_TYPE(s32, u16, S32_MIN, true); + TEST_OVERFLOWS_TYPE(s32, u32, S32_MAX, false); + TEST_OVERFLOWS_TYPE(s32, u32, -1, true); + TEST_OVERFLOWS_TYPE(s32, u32, S32_MIN, true); +#if BITS_PER_LONG == 64 + TEST_OVERFLOWS_TYPE(s32, u64, S32_MAX, false); + TEST_OVERFLOWS_TYPE(s32, u64, -1, true); + TEST_OVERFLOWS_TYPE(s32, u64, S32_MIN, true); +#endif + TEST_OVERFLOWS_TYPE(s32, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s32, s8, S8_MIN, false); + TEST_OVERFLOWS_TYPE(s32, s8, (s32)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s32, s8, (s32)S8_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s32, s8, S32_MAX, true); + TEST_OVERFLOWS_TYPE(s32, s8, S32_MIN, true); + TEST_OVERFLOWS_TYPE(s32, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s32, s16, S16_MIN, false); + TEST_OVERFLOWS_TYPE(s32, s16, (s32)S16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s32, s16, (s32)S16_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s32, s16, S32_MAX, true); + TEST_OVERFLOWS_TYPE(s32, s16, S32_MIN, true); + TEST_OVERFLOWS_TYPE(s32, s32, S32_MAX, false); + TEST_OVERFLOWS_TYPE(s32, s32, S32_MIN, false); +#if BITS_PER_LONG == 64 + TEST_OVERFLOWS_TYPE(s32, s64, S32_MAX, false); + TEST_OVERFLOWS_TYPE(s32, s64, S32_MIN, false); + TEST_OVERFLOWS_TYPE(u64, u8, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(u64, u8, (u64)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, u16, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, u16, U16_MAX, false); + TEST_OVERFLOWS_TYPE(u64, u16, (u64)U16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, u32, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, u32, U32_MAX, false); + TEST_OVERFLOWS_TYPE(u64, u32, (u64)U32_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, u64, U64_MAX, false); + TEST_OVERFLOWS_TYPE(u64, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(u64, s8, (u64)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, s8, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(u64, s16, (u64)S16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, s16, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, s32, S32_MAX, false); + TEST_OVERFLOWS_TYPE(u64, s32, (u64)S32_MAX + 1, true); + TEST_OVERFLOWS_TYPE(u64, s32, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, s64, S64_MAX, false); + TEST_OVERFLOWS_TYPE(u64, s64, U64_MAX, true); + TEST_OVERFLOWS_TYPE(u64, s64, (u64)S64_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, u8, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, u8, S64_MIN, true); + TEST_OVERFLOWS_TYPE(s64, u8, -1, true); + TEST_OVERFLOWS_TYPE(s64, u8, U8_MAX, false); + TEST_OVERFLOWS_TYPE(s64, u8, (s64)U8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, u16, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, u16, S64_MIN, true); + TEST_OVERFLOWS_TYPE(s64, u16, -1, true); + TEST_OVERFLOWS_TYPE(s64, u16, U16_MAX, false); + TEST_OVERFLOWS_TYPE(s64, u16, (s64)U16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, u32, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, u32, S64_MIN, true); + TEST_OVERFLOWS_TYPE(s64, u32, -1, true); + TEST_OVERFLOWS_TYPE(s64, u32, U32_MAX, false); + TEST_OVERFLOWS_TYPE(s64, u32, (s64)U32_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, u64, S64_MAX, false); + TEST_OVERFLOWS_TYPE(s64, u64, S64_MIN, true); + TEST_OVERFLOWS_TYPE(s64, u64, -1, true); + TEST_OVERFLOWS_TYPE(s64, s8, S8_MAX, false); + TEST_OVERFLOWS_TYPE(s64, s8, S8_MIN, false); + TEST_OVERFLOWS_TYPE(s64, s8, (s64)S8_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, s8, (s64)S8_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s64, s8, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, s16, S16_MAX, false); + TEST_OVERFLOWS_TYPE(s64, s16, S16_MIN, false); + TEST_OVERFLOWS_TYPE(s64, s16, (s64)S16_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, s16, (s64)S16_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s64, s16, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, s32, S32_MAX, false); + TEST_OVERFLOWS_TYPE(s64, s32, S32_MIN, false); + TEST_OVERFLOWS_TYPE(s64, s32, (s64)S32_MAX + 1, true); + TEST_OVERFLOWS_TYPE(s64, s32, (s64)S32_MIN - 1, true); + TEST_OVERFLOWS_TYPE(s64, s32, S64_MAX, true); + TEST_OVERFLOWS_TYPE(s64, s64, S64_MAX, false); + TEST_OVERFLOWS_TYPE(s64, s64, S64_MIN, false); +#endif + + /* Check for macro side-effects. */ + var = INT_MAX - 1; + __TEST_OVERFLOWS_TYPE(__overflows_type, var++, int, false); + __TEST_OVERFLOWS_TYPE(__overflows_type, var++, int, false); + __TEST_OVERFLOWS_TYPE(__overflows_type, var++, int, true); + var = INT_MAX - 1; + __TEST_OVERFLOWS_TYPE(overflows_type, var++, int, false); + __TEST_OVERFLOWS_TYPE(overflows_type, var++, int, false); + __TEST_OVERFLOWS_TYPE(overflows_type, var++, int, true); + + kunit_info(test, "%d overflows_type() tests finished\n", count); +#undef TEST_OVERFLOWS_TYPE +#undef __TEST_OVERFLOWS_TYPE +} + +static void same_type_test(struct kunit *test) +{ + int count = 0; + int var; + +#define TEST_SAME_TYPE(t1, t2, same) do { \ + typeof(t1) __t1h = type_max(t1); \ + typeof(t1) __t1l = type_min(t1); \ + typeof(t2) __t2h = type_max(t2); \ + typeof(t2) __t2l = type_min(t2); \ + KUNIT_EXPECT_EQ(test, true, __same_type(t1, __t1h)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(t1, __t1l)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(__t1h, t1)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(__t1l, t1)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(t2, __t2h)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(t2, __t2l)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(__t2h, t2)); \ + KUNIT_EXPECT_EQ(test, true, __same_type(__t2l, t2)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(t1, t2)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(t2, __t1h)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(t2, __t1l)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(__t1h, t2)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(__t1l, t2)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(t1, __t2h)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(t1, __t2l)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(__t2h, t1)); \ + KUNIT_EXPECT_EQ(test, same, __same_type(__t2l, t1)); \ +} while (0) + +#if BITS_PER_LONG == 64 +# define TEST_SAME_TYPE64(base, t, m) TEST_SAME_TYPE(base, t, m) +#else +# define TEST_SAME_TYPE64(base, t, m) do { } while (0) +#endif + +#define TEST_TYPE_SETS(base, mu8, mu16, mu32, ms8, ms16, ms32, mu64, ms64) \ +do { \ + TEST_SAME_TYPE(base, u8, mu8); \ + TEST_SAME_TYPE(base, u16, mu16); \ + TEST_SAME_TYPE(base, u32, mu32); \ + TEST_SAME_TYPE(base, s8, ms8); \ + TEST_SAME_TYPE(base, s16, ms16); \ + TEST_SAME_TYPE(base, s32, ms32); \ + TEST_SAME_TYPE64(base, u64, mu64); \ + TEST_SAME_TYPE64(base, s64, ms64); \ +} while (0) + + TEST_TYPE_SETS(u8, true, false, false, false, false, false, false, false); + TEST_TYPE_SETS(u16, false, true, false, false, false, false, false, false); + TEST_TYPE_SETS(u32, false, false, true, false, false, false, false, false); + TEST_TYPE_SETS(s8, false, false, false, true, false, false, false, false); + TEST_TYPE_SETS(s16, false, false, false, false, true, false, false, false); + TEST_TYPE_SETS(s32, false, false, false, false, false, true, false, false); +#if BITS_PER_LONG == 64 + TEST_TYPE_SETS(u64, false, false, false, false, false, false, true, false); + TEST_TYPE_SETS(s64, false, false, false, false, false, false, false, true); +#endif + + /* Check for macro side-effects. */ + var = 4; + KUNIT_EXPECT_EQ(test, var, 4); + KUNIT_EXPECT_TRUE(test, __same_type(var++, int)); + KUNIT_EXPECT_EQ(test, var, 4); + KUNIT_EXPECT_TRUE(test, __same_type(int, var++)); + KUNIT_EXPECT_EQ(test, var, 4); + KUNIT_EXPECT_TRUE(test, __same_type(var++, var++)); + KUNIT_EXPECT_EQ(test, var, 4); + + kunit_info(test, "%d __same_type() tests finished\n", count); + +#undef TEST_TYPE_SETS +#undef TEST_SAME_TYPE64 +#undef TEST_SAME_TYPE +} + +static void castable_to_type_test(struct kunit *test) +{ + int count = 0; + +#define TEST_CASTABLE_TO_TYPE(arg1, arg2, pass) do { \ + bool __pass = castable_to_type(arg1, arg2); \ + KUNIT_EXPECT_EQ_MSG(test, __pass, pass, \ + "expected castable_to_type(" #arg1 ", " #arg2 ") to%s pass\n",\ + pass ? "" : " not"); \ + count++; \ +} while (0) + + TEST_CASTABLE_TO_TYPE(16, u8, true); + TEST_CASTABLE_TO_TYPE(16, u16, true); + TEST_CASTABLE_TO_TYPE(16, u32, true); + TEST_CASTABLE_TO_TYPE(16, s8, true); + TEST_CASTABLE_TO_TYPE(16, s16, true); + TEST_CASTABLE_TO_TYPE(16, s32, true); + TEST_CASTABLE_TO_TYPE(-16, s8, true); + TEST_CASTABLE_TO_TYPE(-16, s16, true); + TEST_CASTABLE_TO_TYPE(-16, s32, true); +#if BITS_PER_LONG == 64 + TEST_CASTABLE_TO_TYPE(16, u64, true); + TEST_CASTABLE_TO_TYPE(-16, s64, true); +#endif + +#define TEST_CASTABLE_TO_TYPE_VAR(width) do { \ + u ## width u ## width ## var = 0; \ + s ## width s ## width ## var = 0; \ + \ + /* Constant expressions that fit types. */ \ + TEST_CASTABLE_TO_TYPE(type_max(u ## width), u ## width, true); \ + TEST_CASTABLE_TO_TYPE(type_min(u ## width), u ## width, true); \ + TEST_CASTABLE_TO_TYPE(type_max(u ## width), u ## width ## var, true); \ + TEST_CASTABLE_TO_TYPE(type_min(u ## width), u ## width ## var, true); \ + TEST_CASTABLE_TO_TYPE(type_max(s ## width), s ## width, true); \ + TEST_CASTABLE_TO_TYPE(type_min(s ## width), s ## width, true); \ + TEST_CASTABLE_TO_TYPE(type_max(s ## width), s ## width ## var, true); \ + TEST_CASTABLE_TO_TYPE(type_min(u ## width), s ## width ## var, true); \ + /* Constant expressions that do not fit types. */ \ + TEST_CASTABLE_TO_TYPE(type_max(u ## width), s ## width, false); \ + TEST_CASTABLE_TO_TYPE(type_max(u ## width), s ## width ## var, false); \ + TEST_CASTABLE_TO_TYPE(type_min(s ## width), u ## width, false); \ + TEST_CASTABLE_TO_TYPE(type_min(s ## width), u ## width ## var, false); \ + /* Non-constant expression with mismatched type. */ \ + TEST_CASTABLE_TO_TYPE(s ## width ## var, u ## width, false); \ + TEST_CASTABLE_TO_TYPE(u ## width ## var, s ## width, false); \ +} while (0) + +#define TEST_CASTABLE_TO_TYPE_RANGE(width) do { \ + unsigned long big = U ## width ## _MAX; \ + signed long small = S ## width ## _MIN; \ + u ## width u ## width ## var = 0; \ + s ## width s ## width ## var = 0; \ + \ + /* Constant expression in range. */ \ + TEST_CASTABLE_TO_TYPE(U ## width ## _MAX, u ## width, true); \ + TEST_CASTABLE_TO_TYPE(U ## width ## _MAX, u ## width ## var, true); \ + TEST_CASTABLE_TO_TYPE(S ## width ## _MIN, s ## width, true); \ + TEST_CASTABLE_TO_TYPE(S ## width ## _MIN, s ## width ## var, true); \ + /* Constant expression out of range. */ \ + TEST_CASTABLE_TO_TYPE((unsigned long)U ## width ## _MAX + 1, u ## width, false); \ + TEST_CASTABLE_TO_TYPE((unsigned long)U ## width ## _MAX + 1, u ## width ## var, false); \ + TEST_CASTABLE_TO_TYPE((signed long)S ## width ## _MIN - 1, s ## width, false); \ + TEST_CASTABLE_TO_TYPE((signed long)S ## width ## _MIN - 1, s ## width ## var, false); \ + /* Non-constant expression with mismatched type. */ \ + TEST_CASTABLE_TO_TYPE(big, u ## width, false); \ + TEST_CASTABLE_TO_TYPE(big, u ## width ## var, false); \ + TEST_CASTABLE_TO_TYPE(small, s ## width, false); \ + TEST_CASTABLE_TO_TYPE(small, s ## width ## var, false); \ +} while (0) + + TEST_CASTABLE_TO_TYPE_VAR(8); + TEST_CASTABLE_TO_TYPE_VAR(16); + TEST_CASTABLE_TO_TYPE_VAR(32); +#if BITS_PER_LONG == 64 + TEST_CASTABLE_TO_TYPE_VAR(64); +#endif + + TEST_CASTABLE_TO_TYPE_RANGE(8); + TEST_CASTABLE_TO_TYPE_RANGE(16); +#if BITS_PER_LONG == 64 + TEST_CASTABLE_TO_TYPE_RANGE(32); +#endif + kunit_info(test, "%d castable_to_type() tests finished\n", count); + +#undef TEST_CASTABLE_TO_TYPE_RANGE +#undef TEST_CASTABLE_TO_TYPE_VAR +#undef TEST_CASTABLE_TO_TYPE +} + static struct kunit_case overflow_test_cases[] = { KUNIT_CASE(u8_u8__u8_overflow_test), KUNIT_CASE(s8_s8__s8_overflow_test), @@ -755,6 +1133,9 @@ static struct kunit_case overflow_test_cases[] = { KUNIT_CASE(shift_nonsense_test), KUNIT_CASE(overflow_allocation_test), KUNIT_CASE(overflow_size_helpers_test), + KUNIT_CASE(overflows_type_test), + KUNIT_CASE(same_type_test), + KUNIT_CASE(castable_to_type_test), {} }; -- GitLab From a072f249b1b3a457196c83df622e3aa376b1f8df Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 1 Nov 2022 14:16:52 +0000 Subject: [PATCH 0338/1423] dt-bindings: i2c: mv64xxx: Add F1C100s compatible string The I2C controller IP used in the Allwinner F1C100s series of SoCs is compatible with the ones used in the other Allwinner SoCs. Add an F1C100s specific compatible string to the list of existing names. Signed-off-by: Andre Przywara Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml index 93c164aa00daf..984fc1ed3ec6a 100644 --- a/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml @@ -19,6 +19,7 @@ properties: - const: allwinner,sun6i-a31-i2c - items: - enum: + - allwinner,suniv-f1c100s-i2c - allwinner,sun8i-a23-i2c - allwinner,sun8i-a83t-i2c - allwinner,sun8i-v536-i2c -- GitLab From 52951ea193ad3b77c433497425a1049520fd6f22 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 1 Nov 2022 16:07:27 +0800 Subject: [PATCH 0339/1423] i2c: hisi: Add initial device tree support The HiSilicon I2C controller can be used on embedded platform, which boot from devicetree. Signed-off-by: Weilong Chen Acked-by: Yicong Yang Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- drivers/i2c/busses/i2c-hisi.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index e50f9603d189e..a7bfddf08fa7b 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -673,7 +673,7 @@ config I2C_HIGHLANDER config I2C_HISI tristate "HiSilicon I2C controller" - depends on (ARM64 && ACPI) || COMPILE_TEST + depends on ARM64 || COMPILE_TEST help Say Y here if you want to have Hisilicon I2C controller support available on the Kunpeng Server. diff --git a/drivers/i2c/busses/i2c-hisi.c b/drivers/i2c/busses/i2c-hisi.c index 76c3d8f6fc3c6..bcc97e4fcb654 100644 --- a/drivers/i2c/busses/i2c-hisi.c +++ b/drivers/i2c/busses/i2c-hisi.c @@ -489,11 +489,18 @@ static const struct acpi_device_id hisi_i2c_acpi_ids[] = { }; MODULE_DEVICE_TABLE(acpi, hisi_i2c_acpi_ids); +static const struct of_device_id hisi_i2c_dts_ids[] = { + { .compatible = "hisilicon,ascend910-i2c", }, + { } +}; +MODULE_DEVICE_TABLE(of, hisi_i2c_dts_ids); + static struct platform_driver hisi_i2c_driver = { .probe = hisi_i2c_probe, .driver = { .name = "hisi-i2c", .acpi_match_table = hisi_i2c_acpi_ids, + .of_match_table = hisi_i2c_dts_ids, }, }; module_platform_driver(hisi_i2c_driver); -- GitLab From e77f7ba726cc0c9b1c62b295d2aac42c3a18ebd1 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 1 Nov 2022 16:07:28 +0800 Subject: [PATCH 0340/1423] dt-bindings: i2c: add entry for hisilicon,ascend910-i2c Add the new compatible for HiSilicon i2c. Signed-off-by: Weilong Chen Reviewed-by: Rob Herring Reviewed-by: Yicong Yang Signed-off-by: Wolfram Sang --- .../bindings/i2c/hisilicon,ascend910-i2c.yaml | 73 +++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 74 insertions(+) create mode 100644 Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml diff --git a/Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml b/Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml new file mode 100644 index 0000000000000..7d7a8de7bcd89 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i2c/hisilicon,ascend910-i2c.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: HiSilicon common I2C controller + +maintainers: + - Yicong Yang + +description: + The HiSilicon common I2C controller can be used for many different + types of SoC such as Huawei Ascend AI series chips. + +allOf: + - $ref: /schemas/i2c/i2c-controller.yaml# + +properties: + compatible: + const: hisilicon,ascend910-i2c + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-frequency: + default: 400000 + + i2c-sda-falling-time-ns: + default: 343 + + i2c-scl-falling-time-ns: + default: 203 + + i2c-sda-hold-time-ns: + default: 830 + + i2c-scl-rising-time-ns: + default: 365 + + i2c-digital-filter-width-ns: + default: 0 + +required: + - compatible + - reg + - interrupts + +unevaluatedProperties: false + +examples: + - | + #include + + i2c@38b0000 { + compatible = "hisilicon,ascend910-i2c"; + reg = <0x38b0000 0x10000>; + interrupts = ; + i2c-sda-falling-time-ns = <56>; + i2c-scl-falling-time-ns = <56>; + i2c-sda-hold-time-ns = <56>; + i2c-scl-rising-time-ns = <56>; + i2c-digital-filter; + i2c-digital-filter-width-ns = <0x0>; + clocks = <&alg_clk>; + clock-frequency = <400000>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 379945f82a643..1d09b36b9b9c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9214,6 +9214,7 @@ M: Yicong Yang L: linux-i2c@vger.kernel.org S: Maintained W: https://www.hisilicon.com +F: Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml F: drivers/i2c/busses/i2c-hisi.c HISILICON LPC BUS DRIVER -- GitLab From 1adf3cc20d693569ebee90fd91fa34b0570fcd6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:05 +0800 Subject: [PATCH 0341/1423] iommu: Add max_pasids field in struct iommu_device Use this field to keep the number of supported PASIDs that an IOMMU hardware is able to support. This is a generic attribute of an IOMMU and lifting it into the per-IOMMU device structure makes it possible to allocate a PASID for device without calls into the IOMMU drivers. Any iommu driver that supports PASID related features should set this field before enabling them on the devices. In the Intel IOMMU driver, intel_iommu_sm is moved to CONFIG_INTEL_IOMMU enclave so that the pasid_supported() helper could be used in dmar.c without compilation errors. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/intel/dmar.c | 7 +++++++ drivers/iommu/intel/iommu.h | 4 ++-- include/linux/iommu.h | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6d5df91c5c465..21cb13da122c8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3543,6 +3543,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) /* SID/SSID sizes */ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg); + smmu->iommu.max_pasids = 1UL << smmu->ssid_bits; /* * If the SMMU supports fewer bits than would fill a single L2 stream diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 5a8f780e7ffd8..3528058d253e4 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1104,6 +1104,13 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) raw_spin_lock_init(&iommu->register_lock); + /* + * A value of N in PSS field of eCap register indicates hardware + * supports PASID field of N+1 bits. + */ + if (pasid_supported(iommu)) + iommu->iommu.max_pasids = 2UL << ecap_pss(iommu->ecap); + /* * This is only for hotplug; at boot time intel_iommu_enabled won't * be set yet. When intel_iommu_init() runs, it registers the units diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 92023dff9513a..cce0598f41092 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -480,8 +480,6 @@ enum { #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) #define VTD_FLAG_SVM_CAPABLE (1 << 2) -extern int intel_iommu_sm; - #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) @@ -795,6 +793,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, extern const struct iommu_ops intel_iommu_ops; #ifdef CONFIG_INTEL_IOMMU +extern int intel_iommu_sm; extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; @@ -810,6 +809,7 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) } #define dmar_disabled (1) #define intel_iommu_enabled (0) +#define intel_iommu_sm (0) #endif static inline const char *decode_prq_descriptor(char *str, size_t size, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e3..e3af4f46e6e07 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -322,12 +322,14 @@ struct iommu_domain_ops { * @list: Used by the iommu-core to keep a list of registered iommus * @ops: iommu-ops for talking to this iommu * @dev: struct device for sysfs handling + * @max_pasids: number of supported PASIDs */ struct iommu_device { struct list_head list; const struct iommu_ops *ops; struct fwnode_handle *fwnode; struct device *dev; + u32 max_pasids; }; /** -- GitLab From 22d2c7afb3697a68c7fc05c935ef662dee06dc60 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:06 +0800 Subject: [PATCH 0342/1423] iommu: Add max_pasids field in struct dev_iommu Use this field to save the number of PASIDs that a device is able to consume. It is a generic attribute of a device and lifting it into the per-device dev_iommu struct could help to avoid the boilerplate code in various IOMMU drivers. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 20 ++++++++++++++++++++ include/linux/iommu.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 65a3b3d886dc0..297ac79bc21cf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -278,6 +279,24 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = 0, bits = 0; + int ret; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret > 0) + max_pasids = ret; + } else { + ret = device_property_read_u32(dev, "pasid-num-bits", &bits); + if (!ret) + max_pasids = 1UL << bits; + } + + return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); +} + static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -303,6 +322,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list } dev->iommu->iommu_dev = iommu_dev; + dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); group = iommu_group_get_for_dev(dev); if (IS_ERR(group)) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e3af4f46e6e07..ac3f6c6dcc6d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -368,6 +368,7 @@ struct iommu_fault_param { * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data + * @max_pasids: number of PASIDs this device can consume * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; @@ -379,6 +380,7 @@ struct dev_iommu { struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; + u32 max_pasids; }; int iommu_device_register(struct iommu_device *iommu, -- GitLab From 942fd5435dccb273f90176b046ae6bbba60cfbd8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:07 +0800 Subject: [PATCH 0343/1423] iommu: Remove SVM_FLAG_SUPERVISOR_MODE support The current kernel DMA with PASID support is based on the SVA with a flag SVM_FLAG_SUPERVISOR_MODE. The IOMMU driver binds the kernel memory address space to a PASID of the device. The device driver programs the device with kernel virtual address (KVA) for DMA access. There have been security and functional issues with this approach: - The lack of IOTLB synchronization upon kernel page table updates. (vmalloc, module/BPF loading, CONFIG_DEBUG_PAGEALLOC etc.) - Other than slight more protection, using kernel virtual address (KVA) has little advantage over physical address. There are also no use cases yet where DMA engines need kernel virtual addresses for in-kernel DMA. This removes SVM_FLAG_SUPERVISOR_MODE support from the IOMMU interface. The device drivers are suggested to handle kernel DMA with PASID through the kernel DMA APIs. The drvdata parameter in iommu_sva_bind_device() and all callbacks is not needed anymore. Cleanup them as well. Link: https://lore.kernel.org/linux-iommu/20210511194726.GP1002214@nvidia.com/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Fenghua Yu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/dma/idxd/cdev.c | 3 +- drivers/dma/idxd/init.c | 25 +-------- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +- drivers/iommu/intel/iommu.h | 3 +- drivers/iommu/intel/svm.c | 55 +++++-------------- drivers/iommu/iommu.c | 5 +- drivers/misc/uacce/uacce.c | 2 +- include/linux/intel-svm.h | 13 ----- include/linux/iommu.h | 8 +-- 10 files changed, 25 insertions(+), 97 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index c2808fd081d65..66720001ba1ce 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -100,7 +99,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) filp->private_data = ctx; if (device_user_pasid_enabled(idxd)) { - sva = iommu_sva_bind_device(dev, current->mm, NULL); + sva = iommu_sva_bind_device(dev, current->mm); if (IS_ERR(sva)) { rc = PTR_ERR(sva); dev_err(dev, "pasid allocation failed: %d\n", rc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 2b18d512cbfc9..2c0fcfdc75c72 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -502,29 +501,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d static int idxd_enable_system_pasid(struct idxd_device *idxd) { - int flags; - unsigned int pasid; - struct iommu_sva *sva; - - flags = SVM_FLAG_SUPERVISOR_MODE; - - sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); - if (IS_ERR(sva)) { - dev_warn(&idxd->pdev->dev, - "iommu sva bind failed: %ld\n", PTR_ERR(sva)); - return PTR_ERR(sva); - } - - pasid = iommu_sva_get_pasid(sva); - if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); - return -ENODEV; - } - - idxd->sva = sva; - idxd->pasid = pasid; - dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); - return 0; + return -EOPNOTSUPP; } static void idxd_disable_system_pasid(struct idxd_device *idxd) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 5968a568aae2a..8fcf0df4bd0ea 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -367,8 +367,7 @@ err_free_bond: return ERR_PTR(ret); } -struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) { struct iommu_sva *handle; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index cd48590ada303..d2ba86470c423 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -754,8 +754,7 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master); int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); -struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); +struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm); void arm_smmu_sva_unbind(struct iommu_sva *handle); u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle); void arm_smmu_sva_notifier_synchronize(void); @@ -791,7 +790,7 @@ static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master } static inline struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) { return ERR_PTR(-ENODEV); } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index cce0598f41092..33e5bcaf2a6cf 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -748,8 +748,7 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm); void intel_svm_unbind(struct iommu_sva *handle); u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 7d08eb034f2d2..94bc47b68c935 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -296,8 +296,7 @@ out: return 0; } -static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, - unsigned int flags) +static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm) { ioasid_t max_pasid = dev_is_pci(dev) ? pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; @@ -307,8 +306,7 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct mm_struct *mm, - unsigned int flags) + struct mm_struct *mm) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_svm_dev *sdev; @@ -324,22 +322,18 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, svm->pasid = mm->pasid; svm->mm = mm; - svm->flags = flags; INIT_LIST_HEAD_RCU(&svm->devs); - if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { - svm->notifier.ops = &intel_mmuops; - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ERR_PTR(ret); - } + svm->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + kfree(svm); + return ERR_PTR(ret); } ret = pasid_private_add(svm->pasid, svm); if (ret) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); kfree(svm); return ERR_PTR(ret); } @@ -374,9 +368,7 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, } /* Setup the pasid table: */ - sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? - PASID_FLAG_SUPERVISOR_MODE : 0; - sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, FLPT_DEFAULT_DID, sflags); if (ret) @@ -390,8 +382,7 @@ free_sdev: kfree(sdev); free_svm: if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); pasid_private_remove(mm->pasid); kfree(svm); } @@ -780,40 +771,20 @@ prq_advance: return IRQ_RETVAL(handled); } -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm) { struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - unsigned int flags = 0; struct iommu_sva *sva; int ret; - if (drvdata) - flags = *(unsigned int *)drvdata; - - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) { - dev_err(dev, "%s: Supervisor PASID not supported\n", - iommu->name); - return ERR_PTR(-EOPNOTSUPP); - } - - if (mm) { - dev_err(dev, "%s: Supervisor PASID with user provided mm\n", - iommu->name); - return ERR_PTR(-EINVAL); - } - - mm = &init_mm; - } - mutex_lock(&pasid_mutex); - ret = intel_svm_alloc_pasid(dev, mm, flags); + ret = intel_svm_alloc_pasid(dev, mm); if (ret) { mutex_unlock(&pasid_mutex); return ERR_PTR(ret); } - sva = intel_svm_bind_mm(iommu, dev, mm, flags); + sva = intel_svm_bind_mm(iommu, dev, mm); mutex_unlock(&pasid_mutex); return sva; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 297ac79bc21cf..a94ec648c88bb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2750,7 +2750,6 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device * @mm: the mm to bind, caller must hold a reference to it - * @drvdata: opaque data pointer to pass to bind callback * * Create a bond between device and address space, allowing the device to access * the mm using the returned PASID. If a bond already exists between @device and @@ -2763,7 +2762,7 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); * On error, returns an ERR_PTR value. */ struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { struct iommu_group *group; struct iommu_sva *handle = ERR_PTR(-EINVAL); @@ -2788,7 +2787,7 @@ iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) if (iommu_group_device_count(group) != 1) goto out_unlock; - handle = ops->sva_bind(dev, mm, drvdata); + handle = ops->sva_bind(dev, mm); out_unlock: mutex_unlock(&group->mutex); diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index b70a013139c74..905eff1f840ed 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -108,7 +108,7 @@ static int uacce_bind_queue(struct uacce_device *uacce, struct uacce_queue *q) if (!(uacce->flags & UACCE_DEV_SVA)) return 0; - handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL); + handle = iommu_sva_bind_device(uacce->parent, current->mm); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 207ef06ba3e1d..f9a0d44f6fdb1 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -13,17 +13,4 @@ #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) #define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) -/* - * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only - * for access to kernel addresses. No IOTLB flushes are automatically done - * for kernel mappings; it is valid only for access to the kernel's static - * 1:1 mapping of physical memory — not to vmalloc or even module mappings. - * A future API addition may permit the use of such ranges, by means of an - * explicit IOTLB flush call (akin to the DMA API's unmap method). - * - * It is unlikely that we will ever hook into flush_tlb_kernel_range() to - * do such IOTLB flushes automatically. - */ -#define SVM_FLAG_SUPERVISOR_MODE BIT(0) - #endif /* __INTEL_SVM_H__ */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ac3f6c6dcc6d6..72bb0531aa769 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -247,8 +247,7 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm, - void *drvdata); + struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); void (*sva_unbind)(struct iommu_sva *handle); u32 (*sva_get_pasid)(struct iommu_sva *handle); @@ -668,8 +667,7 @@ int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm, - void *drvdata); + struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); @@ -1000,7 +998,7 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { return NULL; } -- GitLab From 201007ef707a8bb5592cd07dd46fc9222c48e0b9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:08 +0800 Subject: [PATCH 0344/1423] PCI: Enable PASID only when ACS RR & UF enabled on upstream path The Requester ID/Process Address Space ID (PASID) combination identifies an address space distinct from the PCI bus address space, e.g., an address space defined by an IOMMU. But the PCIe fabric routes Memory Requests based on the TLP address, ignoring any PASID (PCIe r6.0, sec 2.2.10.4), so a TLP with PASID that SHOULD go upstream to the IOMMU may instead be routed as a P2P Request if its address falls in a bridge window. To ensure that all Memory Requests with PASID are routed upstream, only enable PASID if ACS P2P Request Redirect and Upstream Forwarding are enabled for the path leading to the device. Suggested-by: Jason Gunthorpe Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Acked-by: Bjorn Helgaas Reviewed-by: Jason Gunthorpe Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index c967ad6e26267..f9cc2e10b676a 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -382,6 +382,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pasid) return -EINVAL; + if (!pci_acs_path_enabled(pdev, NULL, PCI_ACS_RR | PCI_ACS_UF)) + return -EINVAL; + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; -- GitLab From 16603704559c7a68718059c4f75287886c01b20f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:09 +0800 Subject: [PATCH 0345/1423] iommu: Add attach/detach_dev_pasid iommu interfaces Attaching an IOMMU domain to a PASID of a device is a generic operation for modern IOMMU drivers which support PASID-granular DMA address translation. Currently visible usage scenarios include (but not limited): - SVA (Shared Virtual Address) - kernel DMA with PASID - hardware-assist mediated device This adds the set_dev_pasid domain ops for setting the domain onto a PASID of a device and remove_dev_pasid iommu ops for removing any setup on a PASID of device. This also adds interfaces for device drivers to attach/detach/retrieve a domain for a PASID of a device. If multiple devices share a single group, it's fine as long the fabric always routes every TLP marked with a PASID to the host bridge and only the host bridge. For example, ACS achieves this universally and has been checked when pci_enable_pasid() is called. As we can't reliably tell the source apart in a group, all the devices in a group have to be considered as the same source, and mapped to the same PASID table. The DMA ownership is about the whole device (more precisely, iommu group), including the RID and PASIDs. When the ownership is converted, the pasid array must be empty. This also adds necessary checks in the DMA ownership interfaces. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-- include/linux/iommu.h | 32 ++++++++++ 2 files changed, 169 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a94ec648c88bb..bf22992beb98a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -43,6 +43,7 @@ struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; struct list_head devices; + struct xarray pasid_array; struct mutex mutex; void *iommu_data; void (*iommu_data_release)(void *iommu_data); @@ -723,6 +724,7 @@ struct iommu_group *iommu_group_alloc(void) mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); + xa_init(&group->pasid_array); ret = ida_alloc(&iommu_group_ida, GFP_KERNEL); if (ret < 0) { @@ -3106,7 +3108,8 @@ int iommu_device_use_default_domain(struct device *dev) mutex_lock(&group->mutex); if (group->owner_cnt) { - if (group->owner || !iommu_is_default_domain(group)) { + if (group->owner || !iommu_is_default_domain(group) || + !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } @@ -3137,7 +3140,7 @@ void iommu_device_unuse_default_domain(struct device *dev) return; mutex_lock(&group->mutex); - if (!WARN_ON(!group->owner_cnt)) + if (!WARN_ON(!group->owner_cnt || !xa_empty(&group->pasid_array))) group->owner_cnt--; mutex_unlock(&group->mutex); @@ -3185,7 +3188,8 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) ret = -EPERM; goto unlock_out; } else { - if (group->domain && group->domain != group->default_domain) { + if ((group->domain && group->domain != group->default_domain) || + !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } @@ -3219,7 +3223,8 @@ void iommu_group_release_dma_owner(struct iommu_group *group) int ret; mutex_lock(&group->mutex); - if (WARN_ON(!group->owner_cnt || !group->owner)) + if (WARN_ON(!group->owner_cnt || !group->owner || + !xa_empty(&group->pasid_array))) goto unlock_out; group->owner_cnt = 0; @@ -3250,3 +3255,131 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); + +static int __iommu_set_group_pasid(struct iommu_domain *domain, + struct iommu_group *group, ioasid_t pasid) +{ + struct group_device *device; + int ret = 0; + + list_for_each_entry(device, &group->devices, list) { + ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + if (ret) + break; + } + + return ret; +} + +static void __iommu_remove_group_pasid(struct iommu_group *group, + ioasid_t pasid) +{ + struct group_device *device; + const struct iommu_ops *ops; + + list_for_each_entry(device, &group->devices, list) { + ops = dev_iommu_ops(device->dev); + ops->remove_dev_pasid(device->dev, pasid); + } +} + +/* + * iommu_attach_device_pasid() - Attach a domain to pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * Return: 0 on success, or an error. + */ +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct iommu_group *group; + void *curr; + int ret; + + if (!domain->ops->set_dev_pasid) + return -EOPNOTSUPP; + + group = iommu_group_get(dev); + if (!group) + return -ENODEV; + + mutex_lock(&group->mutex); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); + if (curr) { + ret = xa_err(curr) ? : -EBUSY; + goto out_unlock; + } + + ret = __iommu_set_group_pasid(domain, group, pasid); + if (ret) { + __iommu_remove_group_pasid(group, pasid); + xa_erase(&group->pasid_array, pasid); + } +out_unlock: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); + +/* + * iommu_detach_device_pasid() - Detach the domain from pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * The @domain must have been attached to @pasid of the @dev with + * iommu_attach_device_pasid(). + */ +void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + __iommu_remove_group_pasid(group, pasid); + WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); + mutex_unlock(&group->mutex); + + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); + +/* + * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev + * @dev: the queried device + * @pasid: the pasid of the device + * @type: matched domain type, 0 for any match + * + * This is a variant of iommu_get_domain_for_dev(). It returns the existing + * domain attached to pasid of a device. Callers must hold a lock around this + * function, and both iommu_attach/detach_dev_pasid() whenever a domain of + * type is being manipulated. This API does not internally resolve races with + * attach/detach. + * + * Return: attached domain on success, NULL otherwise. + */ +struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, + ioasid_t pasid, + unsigned int type) +{ + struct iommu_domain *domain; + struct iommu_group *group; + + group = iommu_group_get(dev); + if (!group) + return NULL; + + xa_lock(&group->pasid_array); + domain = xa_load(&group->pasid_array, pasid); + if (type && domain && domain->type != type) + domain = ERR_PTR(-EBUSY); + xa_unlock(&group->pasid_array); + iommu_group_put(group); + + return domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 72bb0531aa769..5d2b78ac5416f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,9 @@ struct iommu_iotlb_gather { * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains + * @remove_dev_pasid: Remove any translation configurations of a specific + * pasid, so that any DMA transactions with this pasid + * will be blocked by the hardware. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -256,6 +259,7 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); + void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; @@ -266,6 +270,7 @@ struct iommu_ops { * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device * @detach_dev: detach an iommu domain from a device + * @set_dev_pasid: set an iommu domain to a pasid of device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. @@ -286,6 +291,8 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); @@ -678,6 +685,13 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1040,6 +1054,24 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) { return false; } + +static inline int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + return -ENODEV; +} + +static inline void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ +} + +static inline struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type) +{ + return NULL; +} #endif /* CONFIG_IOMMU_API */ /** -- GitLab From 136467962e49931dbc6240aea8197fab7e407ba4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:10 +0800 Subject: [PATCH 0346/1423] iommu: Add IOMMU SVA domain support The SVA iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructures to support SVA domain in the iommu core. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support allocation of the SVA domain should provide its own SVA domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Leave the existing fault handler with the existing users and the newly added SVA members excludes it. Suggested-by: Jean-Philippe Brucker Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 20 ++++++++++++++++++++ include/linux/iommu.h | 25 +++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index bf22992beb98a..6a1cd2018e301 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "dma-iommu.h" @@ -1934,6 +1935,8 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + if (domain->type == IOMMU_DOMAIN_SVA) + mmdrop(domain->mm); iommu_put_dma_cookie(domain); domain->ops->free(domain); } @@ -3383,3 +3386,20 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, return domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + + return domain; +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5d2b78ac5416f..776baa3759674 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -64,6 +64,8 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ #define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ +#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ + /* * This are the possible domain-types * @@ -77,6 +79,8 @@ struct iommu_domain_geometry { * certain optimizations for these domains * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB * invalidation. + * IOMMU_DOMAIN_SVA - DMA addresses are shared process addresses + * represented by mm_struct's. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) @@ -86,15 +90,24 @@ struct iommu_domain_geometry { #define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) +#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct { + iommu_fault_handler_t handler; + void *handler_token; + }; + struct { /* IOMMU_DOMAIN_SVA */ + struct mm_struct *mm; + int users; + }; + }; }; static inline bool iommu_is_dma_domain(struct iommu_domain *domain) @@ -685,6 +698,8 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1055,6 +1070,12 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { -- GitLab From eaca8889a1ef50783bcaad143668b735d136fe46 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:11 +0800 Subject: [PATCH 0347/1423] iommu/vt-d: Add SVA domain support Add support for SVA domain allocation and provide an SVA-specific iommu_domain_ops. This implementation is based on the existing SVA code. Possible cleanup and refactoring are left for incremental changes later. The VT-d driver will also need to support setting a DMA domain to a PASID of device. Current SVA implementation uses different data structures to track the domain and device PASID relationship. That's the reason why we need to check the domain type in remove_dev_pasid callback. Eventually we'll consolidate the data structures and remove the need of domain type check. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 25 ++++++++++++++++++++ drivers/iommu/intel/iommu.h | 10 ++++++++ drivers/iommu/intel/svm.c | 47 +++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 48cdcd0a5cf34..7b67e431dd362 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4169,6 +4169,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return domain; case IOMMU_DOMAIN_IDENTITY: return &si_domain->domain; + case IOMMU_DOMAIN_SVA: + return intel_svm_domain_alloc(); default: return NULL; } @@ -4712,6 +4714,28 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); } +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct iommu_domain *domain; + + /* Domain type specific cleanup: */ + domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); + if (domain) { + switch (domain->type) { + case IOMMU_DOMAIN_SVA: + intel_svm_remove_dev_pasid(dev, pasid); + break; + default: + /* should never reach here */ + WARN_ON(1); + break; + } + } + + intel_pasid_tear_down_entry(iommu, dev, pasid, false); +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -4724,6 +4748,7 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, + .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM .sva_bind = intel_svm_bind, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 33e5bcaf2a6cf..252fa344f88a2 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -753,6 +753,8 @@ void intel_svm_unbind(struct iommu_sva *handle); u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); +struct iommu_domain *intel_svm_domain_alloc(void); +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); struct intel_svm_dev { struct list_head list; @@ -777,6 +779,14 @@ struct intel_svm { }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} +static inline struct iommu_domain *intel_svm_domain_alloc(void) +{ + return NULL; +} + +static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ +} #endif #ifdef CONFIG_INTEL_IOMMU_DEBUGFS diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 94bc47b68c935..86c8ea0d9635e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -882,3 +882,50 @@ int intel_svm_page_response(struct device *dev, out: return ret; } + +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + mutex_lock(&pasid_mutex); + intel_svm_unbind_mm(dev, pasid); + mutex_unlock(&pasid_mutex); +} + +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct iommu_sva *sva; + int ret = 0; + + mutex_lock(&pasid_mutex); + sva = intel_svm_bind_mm(iommu, dev, mm); + if (IS_ERR(sva)) + ret = PTR_ERR(sva); + mutex_unlock(&pasid_mutex); + + return ret; +} + +static void intel_svm_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_svm_domain_ops = { + .set_dev_pasid = intel_svm_set_dev_pasid, + .free = intel_svm_domain_free +}; + +struct iommu_domain *intel_svm_domain_alloc(void) +{ + struct dmar_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->domain.ops = &intel_svm_domain_ops; + + return &domain->domain; +} -- GitLab From 386fa64fd52baadb849ed60c78b024fd1618278e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:12 +0800 Subject: [PATCH 0348/1423] arm-smmu-v3/sva: Add SVA domain support Add support for SVA domain allocation and provide an SVA-specific iommu_domain_ops. This implementation is based on the existing SVA code. Possible cleanup and refactoring are left for incremental changes later. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20221031005917.45690-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 61 +++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 +++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 14 +++++ 3 files changed, 90 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 8fcf0df4bd0ea..2d188d12419ec 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -549,3 +549,64 @@ void arm_smmu_sva_notifier_synchronize(void) */ mmu_notifier_synchronize(); } + +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + struct mm_struct *mm = domain->mm; + struct arm_smmu_bond *bond = NULL, *t; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + mutex_lock(&sva_lock); + list_for_each_entry(t, &master->bonds, list) { + if (t->mm == mm) { + bond = t; + break; + } + } + + if (!WARN_ON(!bond) && refcount_dec_and_test(&bond->refs)) { + list_del(&bond->list); + arm_smmu_mmu_notifier_put(bond->smmu_mn); + kfree(bond); + } + mutex_unlock(&sva_lock); +} + +static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + int ret = 0; + struct iommu_sva *handle; + struct mm_struct *mm = domain->mm; + + mutex_lock(&sva_lock); + handle = __arm_smmu_sva_bind(dev, mm); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + mutex_unlock(&sva_lock); + + return ret; +} + +static void arm_smmu_sva_domain_free(struct iommu_domain *domain) +{ + kfree(domain); +} + +static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { + .set_dev_pasid = arm_smmu_sva_set_dev_pasid, + .free = arm_smmu_sva_domain_free +}; + +struct iommu_domain *arm_smmu_sva_domain_alloc(void) +{ + struct iommu_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->ops = &arm_smmu_sva_domain_ops; + + return domain; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 21cb13da122c8..eed2eb8effa3f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2009,6 +2009,9 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; + if (type == IOMMU_DOMAIN_SVA) + return arm_smmu_sva_domain_alloc(); + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ && @@ -2838,6 +2841,17 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); + if (WARN_ON(IS_ERR(domain)) || !domain) + return; + + arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); +} + static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -2846,6 +2860,7 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .remove_dev_pasid = arm_smmu_remove_dev_pasid, .dev_enable_feat = arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, .sva_bind = arm_smmu_sva_bind, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d2ba86470c423..5aa853e98d388 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -758,6 +758,9 @@ struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm); void arm_smmu_sva_unbind(struct iommu_sva *handle); u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle); void arm_smmu_sva_notifier_synchronize(void); +struct iommu_domain *arm_smmu_sva_domain_alloc(void); +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { @@ -803,5 +806,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) } static inline void arm_smmu_sva_notifier_synchronize(void) {} + +static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) +{ + return NULL; +} + +static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t id) +{ +} #endif /* CONFIG_ARM_SMMU_V3_SVA */ #endif /* _ARM_SMMU_V3_H */ -- GitLab From be51b1d6bbff48c7d1943a8ff1e5a55777807f6e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:13 +0800 Subject: [PATCH 0349/1423] iommu/sva: Refactoring iommu_sva_bind/unbind_device() The existing iommu SVA interfaces are implemented by calling the SVA specific iommu ops provided by the IOMMU drivers. There's no need for any SVA specific ops in iommu_ops vector anymore as we can achieve this through the generic attach/detach_dev_pasid domain ops. This refactors the IOMMU SVA interfaces implementation by using the iommu_attach/detach_device_pasid interfaces and align them with the concept of the SVA iommu domain. Put the new SVA code in the SVA related file in order to make it self-contained. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-sva-lib.c | 111 ++++++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 91 ---------------------------- include/linux/iommu.h | 43 +++++++------ 3 files changed, 134 insertions(+), 111 deletions(-) diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index 1065061438960..e425573a17875 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -4,6 +4,7 @@ */ #include #include +#include #include "iommu-sva-lib.h" @@ -69,3 +70,113 @@ struct mm_struct *iommu_sva_find(ioasid_t pasid) return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); } EXPORT_SYMBOL_GPL(iommu_sva_find); + +/** + * iommu_sva_bind_device() - Bind a process address space to a device + * @dev: the device + * @mm: the mm to bind, caller must hold a reference to mm_users + * + * Create a bond between device and address space, allowing the device to + * access the mm using the PASID returned by iommu_sva_get_pasid(). If a + * bond already exists between @device and @mm, an additional internal + * reference is taken. Caller must call iommu_sva_unbind_device() + * to release each reference. + * + * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to + * initialize the required SVA features. + * + * On error, returns an ERR_PTR value. + */ +struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + struct iommu_domain *domain; + struct iommu_sva *handle; + ioasid_t max_pasids; + int ret; + + max_pasids = dev->iommu->max_pasids; + if (!max_pasids) + return ERR_PTR(-EOPNOTSUPP); + + /* Allocate mm->pasid if necessary. */ + ret = iommu_sva_alloc_pasid(mm, 1, max_pasids - 1); + if (ret) + return ERR_PTR(ret); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return ERR_PTR(-ENOMEM); + + mutex_lock(&iommu_sva_lock); + /* Search for an existing domain. */ + domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid, + IOMMU_DOMAIN_SVA); + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); + goto out_unlock; + } + + if (domain) { + domain->users++; + goto out; + } + + /* Allocate a new domain and set it on device pasid. */ + domain = iommu_sva_domain_alloc(dev, mm); + if (!domain) { + ret = -ENOMEM; + goto out_unlock; + } + + ret = iommu_attach_device_pasid(domain, dev, mm->pasid); + if (ret) + goto out_free_domain; + domain->users = 1; +out: + mutex_unlock(&iommu_sva_lock); + handle->dev = dev; + handle->domain = domain; + + return handle; + +out_free_domain: + iommu_domain_free(domain); +out_unlock: + mutex_unlock(&iommu_sva_lock); + kfree(handle); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(iommu_sva_bind_device); + +/** + * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device + * @handle: the handle returned by iommu_sva_bind_device() + * + * Put reference to a bond between device and address space. The device should + * not be issuing any more transaction for this PASID. All outstanding page + * requests for this PASID must have been flushed to the IOMMU. + */ +void iommu_sva_unbind_device(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + ioasid_t pasid = domain->mm->pasid; + struct device *dev = handle->dev; + + mutex_lock(&iommu_sva_lock); + if (--domain->users == 0) { + iommu_detach_device_pasid(domain, dev, pasid); + iommu_domain_free(domain); + } + mutex_unlock(&iommu_sva_lock); + kfree(handle); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); + +u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + + return domain->mm->pasid; +} +EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6a1cd2018e301..c9da0a1bb3b8f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2751,97 +2751,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); -/** - * iommu_sva_bind_device() - Bind a process address space to a device - * @dev: the device - * @mm: the mm to bind, caller must hold a reference to it - * - * Create a bond between device and address space, allowing the device to access - * the mm using the returned PASID. If a bond already exists between @device and - * @mm, it is returned and an additional reference is taken. Caller must call - * iommu_sva_unbind_device() to release each reference. - * - * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to - * initialize the required SVA features. - * - * On error, returns an ERR_PTR value. - */ -struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) -{ - struct iommu_group *group; - struct iommu_sva *handle = ERR_PTR(-EINVAL); - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_bind) - return ERR_PTR(-ENODEV); - - group = iommu_group_get(dev); - if (!group) - return ERR_PTR(-ENODEV); - - /* Ensure device count and domain don't change while we're binding */ - mutex_lock(&group->mutex); - - /* - * To keep things simple, SVA currently doesn't support IOMMU groups - * with more than one device. Existing SVA-capable systems are not - * affected by the problems that required IOMMU groups (lack of ACS - * isolation, device ID aliasing and other hardware issues). - */ - if (iommu_group_device_count(group) != 1) - goto out_unlock; - - handle = ops->sva_bind(dev, mm); - -out_unlock: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return handle; -} -EXPORT_SYMBOL_GPL(iommu_sva_bind_device); - -/** - * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device - * @handle: the handle returned by iommu_sva_bind_device() - * - * Put reference to a bond between device and address space. The device should - * not be issuing any more transaction for this PASID. All outstanding page - * requests for this PASID must have been flushed to the IOMMU. - */ -void iommu_sva_unbind_device(struct iommu_sva *handle) -{ - struct iommu_group *group; - struct device *dev = handle->dev; - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_unbind) - return; - - group = iommu_group_get(dev); - if (!group) - return; - - mutex_lock(&group->mutex); - ops->sva_unbind(handle); - mutex_unlock(&group->mutex); - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); - -u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - const struct iommu_ops *ops = dev_iommu_ops(handle->dev); - - if (!ops->sva_get_pasid) - return IOMMU_PASID_INVALID; - - return ops->sva_get_pasid(handle); -} -EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); - /* * Changes the default domain of an iommu group that has *only* one device * diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 776baa3759674..bee5659d07ebe 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -645,6 +645,7 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; + struct iommu_domain *domain; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -686,11 +687,6 @@ void iommu_release_device(struct device *dev); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); -struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm); -void iommu_sva_unbind_device(struct iommu_sva *handle); -u32 iommu_sva_get_pasid(struct iommu_sva *handle); - int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); @@ -1026,21 +1022,6 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) return -ENODEV; } -static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - -static inline void iommu_sva_unbind_device(struct iommu_sva *handle) -{ -} - -static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - return IOMMU_PASID_INVALID; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; @@ -1154,4 +1135,26 @@ static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_m #endif /* CONFIG_IOMMU_DMA */ +#ifdef CONFIG_IOMMU_SVA +struct iommu_sva *iommu_sva_bind_device(struct device *dev, + struct mm_struct *mm); +void iommu_sva_unbind_device(struct iommu_sva *handle); +u32 iommu_sva_get_pasid(struct iommu_sva *handle); +#else +static inline struct iommu_sva * +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + +static inline void iommu_sva_unbind_device(struct iommu_sva *handle) +{ +} + +static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + return IOMMU_PASID_INVALID; +} +#endif /* CONFIG_IOMMU_SVA */ + #endif /* __LINUX_IOMMU_H */ -- GitLab From 1c263576f4735e063e234fa5f43fd3046d36b5b3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:14 +0800 Subject: [PATCH 0350/1423] iommu: Remove SVA related callbacks from iommu ops These ops'es have been deprecated. There's no need for them anymore. Remove them to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 40 --------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 -- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 16 ------ drivers/iommu/intel/iommu.c | 3 -- drivers/iommu/intel/iommu.h | 3 -- drivers/iommu/intel/svm.c | 49 ------------------- include/linux/iommu.h | 7 --- 7 files changed, 121 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2d188d12419ec..9541afbba73c5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -344,11 +344,6 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) if (!bond) return ERR_PTR(-ENOMEM); - /* Allocate a PASID for this mm if necessary */ - ret = iommu_sva_alloc_pasid(mm, 1, (1U << master->ssid_bits) - 1); - if (ret) - goto err_free_bond; - bond->mm = mm; bond->sva.dev = dev; refcount_set(&bond->refs, 1); @@ -367,41 +362,6 @@ err_free_bond: return ERR_PTR(ret); } -struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) -{ - struct iommu_sva *handle; - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return ERR_PTR(-EINVAL); - - mutex_lock(&sva_lock); - handle = __arm_smmu_sva_bind(dev, mm); - mutex_unlock(&sva_lock); - return handle; -} - -void arm_smmu_sva_unbind(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - mutex_lock(&sva_lock); - if (refcount_dec_and_test(&bond->refs)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} - -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - return bond->mm->pasid; -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index eed2eb8effa3f..891e87ea54dbe 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2863,9 +2863,6 @@ static struct iommu_ops arm_smmu_ops = { .remove_dev_pasid = arm_smmu_remove_dev_pasid, .dev_enable_feat = arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, - .sva_bind = arm_smmu_sva_bind, - .sva_unbind = arm_smmu_sva_unbind, - .sva_get_pasid = arm_smmu_sva_get_pasid, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .pgsize_bitmap = -1UL, /* Restricted during device attach */ diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 5aa853e98d388..8d772ea8a5836 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -754,9 +754,6 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master); int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); -struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm); -void arm_smmu_sva_unbind(struct iommu_sva *handle); -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle); void arm_smmu_sva_notifier_synchronize(void); struct iommu_domain *arm_smmu_sva_domain_alloc(void); void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, @@ -792,19 +789,6 @@ static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master return false; } -static inline struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) -{ - return ERR_PTR(-ENODEV); -} - -static inline void arm_smmu_sva_unbind(struct iommu_sva *handle) {} - -static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) -{ - return IOMMU_PASID_INVALID; -} - static inline void arm_smmu_sva_notifier_synchronize(void) {} static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7b67e431dd362..5a41b10593b7a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4751,9 +4751,6 @@ const struct iommu_ops intel_iommu_ops = { .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM - .sva_bind = intel_svm_bind, - .sva_unbind = intel_svm_unbind, - .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 252fa344f88a2..251a609fdce32 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -748,9 +748,6 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm); -void intel_svm_unbind(struct iommu_sva *handle); -u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 86c8ea0d9635e..fceae93870189 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -296,14 +296,6 @@ out: return 0; } -static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm) -{ - ioasid_t max_pasid = dev_is_pci(dev) ? - pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - - return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); -} - static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, struct mm_struct *mm) @@ -771,47 +763,6 @@ prq_advance: return IRQ_RETVAL(handled); } -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm) -{ - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - struct iommu_sva *sva; - int ret; - - mutex_lock(&pasid_mutex); - ret = intel_svm_alloc_pasid(dev, mm); - if (ret) { - mutex_unlock(&pasid_mutex); - return ERR_PTR(ret); - } - - sva = intel_svm_bind_mm(iommu, dev, mm); - mutex_unlock(&pasid_mutex); - - return sva; -} - -void intel_svm_unbind(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev = to_intel_svm_dev(sva); - - mutex_lock(&pasid_mutex); - intel_svm_unbind_mm(sdev->dev, sdev->pasid); - mutex_unlock(&pasid_mutex); -} - -u32 intel_svm_get_pasid(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev; - u32 pasid; - - mutex_lock(&pasid_mutex); - sdev = to_intel_svm_dev(sva); - pasid = sdev->pasid; - mutex_unlock(&pasid_mutex); - - return pasid; -} - int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bee5659d07ebe..c337ef1c97bc8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,9 +227,6 @@ struct iommu_iotlb_gather { * driver init to device driver init (default no) * @dev_enable/disable_feat: per device entries to enable/disable * iommu specific features. - * @sva_bind: Bind process address space to device - * @sva_unbind: Unbind process address space from device - * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain @@ -263,10 +260,6 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); - void (*sva_unbind)(struct iommu_sva *handle); - u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); -- GitLab From 8cc93159f91960b4812ea48887e9e7501babc95a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:15 +0800 Subject: [PATCH 0351/1423] iommu: Prepare IOMMU domain for IOPF This adds some mechanisms around the iommu_domain so that the I/O page fault handling framework could route a page fault to the domain and call the fault handler from it. Add pointers to the page fault handler and its private data in struct iommu_domain. The fault handler will be called with the private data as a parameter once a page fault is routed to the domain. Any kernel component which owns an iommu domain could install handler and its private parameter so that the page fault could be further routed and handled. This also prepares the SVA implementation to be the first consumer of the per-domain page fault handling model. The I/O page fault handler for SVA is copied to the SVA file with mmget_not_zero() added before mmap_read_lock(). Suggested-by: Jean-Philippe Brucker Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 7 +++++ drivers/iommu/iommu-sva-lib.c | 58 +++++++++++++++++++++++++++++++++++ drivers/iommu/iommu-sva-lib.h | 8 +++++ drivers/iommu/iommu.c | 4 +++ include/linux/iommu.h | 3 ++ 5 files changed, 80 insertions(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 1df8c1dcae776..aee9e033012f0 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -181,6 +181,13 @@ static void iopf_handle_group(struct work_struct *work) * request completes, outstanding faults will have been dealt with by the time * the PASID is freed. * + * Any valid page fault will be eventually routed to an iommu domain and the + * page fault handler installed there will get called. The users of this + * handling framework should guarantee that the iommu domain could only be + * freed after the device has stopped generating page faults (or the iommu + * hardware has been set to block the page faults) and the pending page faults + * have been flushed. + * * Return: 0 on success and <0 on error. */ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index e425573a17875..089fd61ff4538 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -180,3 +180,61 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) return domain->mm->pasid; } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); + +/* + * I/O page fault handler for SVA + */ +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + vm_fault_t ret; + struct vm_area_struct *vma; + struct mm_struct *mm = data; + unsigned int access_flags = 0; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + struct iommu_fault_page_request *prm = &fault->prm; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; + + if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) + return status; + + if (!mmget_not_zero(mm)) + return status; + + mmap_read_lock(mm); + + vma = find_extend_vma(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; + + if (prm->perm & IOMMU_FAULT_PERM_READ) + access_flags |= VM_READ; + + if (prm->perm & IOMMU_FAULT_PERM_WRITE) { + access_flags |= VM_WRITE; + fault_flags |= FAULT_FLAG_WRITE; + } + + if (prm->perm & IOMMU_FAULT_PERM_EXEC) { + access_flags |= VM_EXEC; + fault_flags |= FAULT_FLAG_INSTRUCTION; + } + + if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) + fault_flags |= FAULT_FLAG_USER; + + if (access_flags & ~vma->vm_flags) + /* Access fault */ + goto out_put_mm; + + ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); + status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : + IOMMU_PAGE_RESP_SUCCESS; + +out_put_mm: + mmap_read_unlock(mm); + mmput(mm); + + return status; +} diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index 8909ea1094e3a..1b3ace4b5863a 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -26,6 +26,8 @@ int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); #else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) @@ -63,5 +65,11 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) { return -ENODEV; } + +static inline enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + return IOMMU_PAGE_RESP_INVALID; +} #endif /* CONFIG_IOMMU_SVA */ #endif /* _IOMMU_SVA_LIB_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c9da0a1bb3b8f..9e0fb18e1b346 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -33,6 +33,8 @@ #include "dma-iommu.h" +#include "iommu-sva-lib.h" + static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -3309,6 +3311,8 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); domain->mm = mm; + domain->iopf_handler = iommu_sva_handle_iopf; + domain->fault_data = mm; return domain; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c337ef1c97bc8..7d2648058e439 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -98,6 +98,9 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, + void *data); + void *fault_data; union { struct { iommu_fault_handler_t handler; -- GitLab From 4bb4211e48fbfb392bb07168b75b1a92832b62f5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:16 +0800 Subject: [PATCH 0352/1423] iommu: Per-domain I/O page fault handling Tweak the I/O page fault handling framework to route the page faults to the domain and call the page fault handler retrieved from the domain. This makes the I/O page fault handling framework possible to serve more usage scenarios as long as they have an IOMMU domain and install a page fault handler in it. Some unused functions are also removed to avoid dead code. The iommu_get_domain_for_dev_pasid() which retrieves attached domain for a {device, PASID} pair is used. It will be used by the page fault handling framework which knows {device, PASID} reported from the iommu driver. We have a guarantee that the SVA domain doesn't go away during IOPF handling, because unbind() won't free the domain until all the pending page requests have been flushed from the pipeline. The drivers either call iopf_queue_flush_dev() explicitly, or in stall case, the device driver is required to flush all DMAs including stalled transactions before calling unbind(). This also renames iopf_handle_group() to iopf_handler() to avoid confusing. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 68 +++++--------------------------------- 1 file changed, 9 insertions(+), 59 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index aee9e033012f0..d046d89cec553 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -69,69 +69,18 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, &resp); } -static enum iommu_page_response_code -iopf_handle_single(struct iopf_fault *iopf) -{ - vm_fault_t ret; - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int access_flags = 0; - unsigned int fault_flags = FAULT_FLAG_REMOTE; - struct iommu_fault_page_request *prm = &iopf->fault.prm; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; - - if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) - return status; - - mm = iommu_sva_find(prm->pasid); - if (IS_ERR_OR_NULL(mm)) - return status; - - mmap_read_lock(mm); - - vma = find_extend_vma(mm, prm->addr); - if (!vma) - /* Unmapped area */ - goto out_put_mm; - - if (prm->perm & IOMMU_FAULT_PERM_READ) - access_flags |= VM_READ; - - if (prm->perm & IOMMU_FAULT_PERM_WRITE) { - access_flags |= VM_WRITE; - fault_flags |= FAULT_FLAG_WRITE; - } - - if (prm->perm & IOMMU_FAULT_PERM_EXEC) { - access_flags |= VM_EXEC; - fault_flags |= FAULT_FLAG_INSTRUCTION; - } - - if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) - fault_flags |= FAULT_FLAG_USER; - - if (access_flags & ~vma->vm_flags) - /* Access fault */ - goto out_put_mm; - - ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); - status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : - IOMMU_PAGE_RESP_SUCCESS; - -out_put_mm: - mmap_read_unlock(mm); - mmput(mm); - - return status; -} - -static void iopf_handle_group(struct work_struct *work) +static void iopf_handler(struct work_struct *work) { struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); + domain = iommu_get_domain_for_dev_pasid(group->dev, + group->last_fault.fault.prm.pasid, 0); + if (!domain || !domain->iopf_handler) + status = IOMMU_PAGE_RESP_INVALID; list_for_each_entry_safe(iopf, next, &group->faults, list) { /* @@ -139,7 +88,8 @@ static void iopf_handle_group(struct work_struct *work) * faults in the group if there is an error. */ if (status == IOMMU_PAGE_RESP_SUCCESS) - status = iopf_handle_single(iopf); + status = domain->iopf_handler(&iopf->fault, + domain->fault_data); if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) @@ -242,7 +192,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); list_add(&group->last_fault.list, &group->faults); - INIT_WORK(&group->work, iopf_handle_group); + INIT_WORK(&group->work, iopf_handler); /* See if we have partial faults for this group */ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { -- GitLab From 757636ed2607a3269cd2764e3e4a0480384c6c26 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:17 +0800 Subject: [PATCH 0353/1423] iommu: Rename iommu-sva-lib.{c,h} Rename iommu-sva-lib.c[h] to iommu-sva.c[h] as it contains all code for SVA implementation in iommu core. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/Makefile | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/svm.c | 2 +- drivers/iommu/io-pgfault.c | 2 +- drivers/iommu/{iommu-sva-lib.c => iommu-sva.c} | 2 +- drivers/iommu/{iommu-sva-lib.h => iommu-sva.h} | 6 +++--- drivers/iommu/iommu.c | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) rename drivers/iommu/{iommu-sva-lib.c => iommu-sva.c} (99%) rename drivers/iommu/{iommu-sva-lib.h => iommu-sva.h} (95%) diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index cc9f381013c35..7fbf6a3376620 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -28,6 +28,6 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 9541afbba73c5..a5a63b1c947eb 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -10,7 +10,7 @@ #include #include "arm-smmu-v3.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" #include "../../io-pgtable-arm.h" struct arm_smmu_mmu_notifier { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 891e87ea54dbe..94a2e53368af9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -29,7 +29,7 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" static bool disable_bypass = true; module_param(disable_bypass, bool, 0444); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 5a41b10593b7a..a934a46bb9e65 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,7 +27,7 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "pasid.h" #include "cap_audit.h" diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index fceae93870189..f32de15da61a2 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -24,7 +24,7 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index d046d89cec553..e5b8b9110c132 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -11,7 +11,7 @@ #include #include -#include "iommu-sva-lib.h" +#include "iommu-sva.h" /** * struct iopf_queue - IO Page Fault queue diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva.c similarity index 99% rename from drivers/iommu/iommu-sva-lib.c rename to drivers/iommu/iommu-sva.c index 089fd61ff4538..24bf9b2b58aa6 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva.c @@ -6,7 +6,7 @@ #include #include -#include "iommu-sva-lib.h" +#include "iommu-sva.h" static DEFINE_MUTEX(iommu_sva_lock); static DECLARE_IOASID_SET(iommu_sva_pasid); diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva.h similarity index 95% rename from drivers/iommu/iommu-sva-lib.h rename to drivers/iommu/iommu-sva.h index 1b3ace4b5863a..7215a761b9628 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva.h @@ -2,8 +2,8 @@ /* * SVA library for IOMMU drivers */ -#ifndef _IOMMU_SVA_LIB_H -#define _IOMMU_SVA_LIB_H +#ifndef _IOMMU_SVA_H +#define _IOMMU_SVA_H #include #include @@ -72,4 +72,4 @@ iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) return IOMMU_PAGE_RESP_INVALID; } #endif /* CONFIG_IOMMU_SVA */ -#endif /* _IOMMU_SVA_LIB_H */ +#endif /* _IOMMU_SVA_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9e0fb18e1b346..c50f68b2b656e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -33,7 +33,7 @@ #include "dma-iommu.h" -#include "iommu-sva-lib.h" +#include "iommu-sva.h" static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); -- GitLab From fdaeb224e2bf747942653e4f70226fcfe60fbf73 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Wed, 26 Oct 2022 12:16:13 -0700 Subject: [PATCH 0354/1423] crypto: tcrypt - Use pr_cont to print test results For some test cases, a line break gets inserted between the test banner and the results. For example, with mode=211 this is the output: [...] testing speed of rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption [...] test 0 (160 bit key, 16 byte blocks): [...] 1 operation in 2373 cycles (16 bytes) --snip-- [...] testing speed of gcm(aes) (generic-gcm-aesni) encryption [...] test 0 (128 bit key, 16 byte blocks): [...] 1 operation in 2338 cycles (16 bytes) Similar behavior is seen in the following cases as well: modprobe tcrypt mode=212 modprobe tcrypt mode=213 modprobe tcrypt mode=221 modprobe tcrypt mode=300 sec=1 modprobe tcrypt mode=400 sec=1 This doesn't happen with mode=215: [...] tcrypt: testing speed of multibuffer rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption [...] tcrypt: test 0 (160 bit key, 16 byte blocks): 1 operation in 2215 cycles (16 bytes) --snip-- [...] tcrypt: testing speed of multibuffer gcm(aes) (generic-gcm-aesni) encryption [...] tcrypt: test 0 (128 bit key, 16 byte blocks): 1 operation in 2191 cycles (16 bytes) This print inconsistency is because printk() is used instead of pr_cont() in a few places. Change these to be pr_cont(). checkpatch warns that pr_cont() shouldn't be used. This can be ignored in this context as tcrypt already uses pr_cont(). Signed-off-by: Anirudh Venkataramanan Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 3f7dc94a63e0b..2822405a0d45d 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -506,8 +506,8 @@ static int test_aead_cycles(struct aead_request *req, int enc, int blen) out: if (ret == 0) - printk("1 operation in %lu cycles (%d bytes)\n", - (cycles + 4) / 8, blen); + pr_cont("1 operation in %lu cycles (%d bytes)\n", + (cycles + 4) / 8, blen); return ret; } @@ -727,8 +727,8 @@ static int test_ahash_jiffies_digest(struct ahash_request *req, int blen, return ret; } - printk("%6u opers/sec, %9lu bytes/sec\n", - bcount / secs, ((long)bcount * blen) / secs); + pr_cont("%6u opers/sec, %9lu bytes/sec\n", + bcount / secs, ((long)bcount * blen) / secs); return 0; } -- GitLab From 837a99f59043c3505d69761575172da1d09220b5 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Wed, 26 Oct 2022 12:16:14 -0700 Subject: [PATCH 0355/1423] crypto: tcrypt - Use pr_info/pr_err Currently, there's mixed use of printk() and pr_info()/pr_err(). The latter prints the module name (because pr_fmt() is defined so) but the former does not. As a result there's inconsistency in the printed output. For example: modprobe mode=211: [...] test 0 (160 bit key, 16 byte blocks): 1 operation in 2320 cycles (16 bytes) [...] test 1 (160 bit key, 64 byte blocks): 1 operation in 2336 cycles (64 bytes) modprobe mode=215: [...] tcrypt: test 0 (160 bit key, 16 byte blocks): 1 operation in 2173 cycles (16 bytes) [...] tcrypt: test 1 (160 bit key, 64 byte blocks): 1 operation in 2241 cycles (64 bytes) Replace all instances of printk() with pr_info()/pr_err() so that the module name is printed consistently. Signed-off-by: Anirudh Venkataramanan Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 2822405a0d45d..78b236bc9cd81 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -575,8 +575,8 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, } crypto_init_wait(&wait); - printk(KERN_INFO "\ntesting speed of %s (%s) %s\n", algo, - get_driver_name(crypto_aead, tfm), e); + pr_info("\ntesting speed of %s (%s) %s\n", algo, + get_driver_name(crypto_aead, tfm), e); req = aead_request_alloc(tfm, GFP_KERNEL); if (!req) { @@ -624,8 +624,8 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, memset(iv, 0xff, iv_len); crypto_aead_clear_flags(tfm, ~0); - printk(KERN_INFO "test %u (%d bit key, %d byte blocks): ", - i, *keysize * 8, bs); + pr_info("test %u (%d bit key, %d byte blocks): ", + i, *keysize * 8, bs); memset(tvmem[0], 0xff, PAGE_SIZE); @@ -877,8 +877,8 @@ static void test_ahash_speed_common(const char *algo, unsigned int secs, return; } - printk(KERN_INFO "\ntesting speed of async %s (%s)\n", algo, - get_driver_name(crypto_ahash, tfm)); + pr_info("\ntesting speed of async %s (%s)\n", algo, + get_driver_name(crypto_ahash, tfm)); if (crypto_ahash_digestsize(tfm) > MAX_DIGEST_SIZE) { pr_err("digestsize(%u) > %d\n", crypto_ahash_digestsize(tfm), @@ -2885,7 +2885,7 @@ static int __init tcrypt_mod_init(void) err = do_test(alg, type, mask, mode, num_mb); if (err) { - printk(KERN_ERR "tcrypt: one or more tests failed!\n"); + pr_err("one or more tests failed!\n"); goto err_free_tv; } else { pr_debug("all tests passed\n"); -- GitLab From a2ef563000aff04352a4aded7b2d2bf19a1674bf Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Wed, 26 Oct 2022 12:16:15 -0700 Subject: [PATCH 0356/1423] crypto: tcrypt - Drop module name from print string The pr_fmt() define includes KBUILD_MODNAME, and so there's no need for pr_err() to also print it. Drop module name from the print string. Signed-off-by: Anirudh Venkataramanan Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 78b236bc9cd81..40e72ec0f5378 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1329,8 +1329,7 @@ static void test_skcipher_speed(const char *algo, int enc, unsigned int secs, req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!req) { - pr_err("tcrypt: skcipher: Failed to allocate request for %s\n", - algo); + pr_err("skcipher: Failed to allocate request for %s\n", algo); goto out; } -- GitLab From 3513828cb8f6db714ee48b1559e6253a57ecf1f6 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Wed, 26 Oct 2022 12:16:16 -0700 Subject: [PATCH 0357/1423] crypto: tcrypt - Drop leading newlines from prints The top level print banners have a leading newline. It's not entirely clear why this exists, but it makes it harder to parse tcrypt test output using a script. Drop said newlines. tcrypt output before this patch: [...] testing speed of rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption [...] test 0 (160 bit key, 16 byte blocks): 1 operation in 2320 cycles (16 bytes) tcrypt output with this patch: [...] testing speed of rfc4106(gcm(aes)) (rfc4106-gcm-aesni) encryption [...] test 0 (160 bit key, 16 byte blocks): 1 operation in 2320 cycles (16 bytes) Signed-off-by: Anirudh Venkataramanan Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 40e72ec0f5378..b096ae901aa80 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -324,7 +324,7 @@ static void test_mb_aead_speed(const char *algo, int enc, int secs, crypto_req_done, &data[i].wait); } - pr_info("\ntesting speed of multibuffer %s (%s) %s\n", algo, + pr_info("testing speed of multibuffer %s (%s) %s\n", algo, get_driver_name(crypto_aead, tfm), e); i = 0; @@ -575,7 +575,7 @@ static void test_aead_speed(const char *algo, int enc, unsigned int secs, } crypto_init_wait(&wait); - pr_info("\ntesting speed of %s (%s) %s\n", algo, + pr_info("testing speed of %s (%s) %s\n", algo, get_driver_name(crypto_aead, tfm), e); req = aead_request_alloc(tfm, GFP_KERNEL); @@ -877,7 +877,7 @@ static void test_ahash_speed_common(const char *algo, unsigned int secs, return; } - pr_info("\ntesting speed of async %s (%s)\n", algo, + pr_info("testing speed of async %s (%s)\n", algo, get_driver_name(crypto_ahash, tfm)); if (crypto_ahash_digestsize(tfm) > MAX_DIGEST_SIZE) { @@ -1117,7 +1117,7 @@ static void test_mb_skcipher_speed(const char *algo, int enc, int secs, crypto_init_wait(&data[i].wait); } - pr_info("\ntesting speed of multibuffer %s (%s) %s\n", algo, + pr_info("testing speed of multibuffer %s (%s) %s\n", algo, get_driver_name(crypto_skcipher, tfm), e); i = 0; @@ -1324,7 +1324,7 @@ static void test_skcipher_speed(const char *algo, int enc, unsigned int secs, return; } - pr_info("\ntesting speed of %s %s (%s) %s\n", async ? "async" : "sync", + pr_info("testing speed of %s %s (%s) %s\n", async ? "async" : "sync", algo, get_driver_name(crypto_skcipher, tfm), e); req = skcipher_request_alloc(tfm, GFP_KERNEL); -- GitLab From e1fa51aa2b04830495c1fc9d3d5dee4a4419b70d Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:53 +0800 Subject: [PATCH 0358/1423] crypto: arm64/sm3 - raise the priority of the CE implementation Raise the priority of the sm3-ce algorithm from 200 to 400, this is to make room for the implementation of sm3-neon. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm3-ce-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index ee98954ae8ca6..54bf6ebcfffb1 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -84,7 +84,7 @@ static struct shash_alg sm3_alg = { .base.cra_driver_name = "sm3-ce", .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, + .base.cra_priority = 400, }; static int __init sm3_ce_mod_init(void) -- GitLab From a41b2129461f6c88e087ca9a6e2fde34cb6deb48 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:54 +0800 Subject: [PATCH 0359/1423] crypto: arm64/sm3 - add NEON assembly implementation This patch adds the NEON acceleration implementation of the SM3 hash algorithm. The main algorithm is based on SM3 NEON accelerated work of the libgcrypt project. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 326 mode of tcrypt, and compares the performance data of sm3-generic and sm3-ce. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: update-size | 16 64 256 1024 2048 4096 8192 ---------------+-------------------------------------------------------- sm3-generic | 185.24 221.28 301.26 307.43 300.83 308.82 308.91 sm3-neon | 171.81 220.20 322.94 339.28 334.09 343.61 343.87 sm3-ce | 227.48 333.48 502.62 527.87 520.45 534.91 535.40 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 11 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/sm3-neon-core.S | 600 ++++++++++++++++++++++++++++++ arch/arm64/crypto/sm3-neon-glue.c | 103 +++++ 4 files changed, 717 insertions(+) create mode 100644 arch/arm64/crypto/sm3-neon-core.S create mode 100644 arch/arm64/crypto/sm3-neon-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 8bd80508a710d..4b121dc0cfba2 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -96,6 +96,17 @@ config CRYPTO_SHA3_ARM64 Architecture: arm64 using: - ARMv8.2 Crypto Extensions +config CRYPTO_SM3_NEON + tristate "Hash functions: SM3 (NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SM3 + help + SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) + + Architecture: arm64 using: + - NEON (Advanced SIMD) extensions + config CRYPTO_SM3_ARM64_CE tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 24bb0c4610de2..087f1625e7751 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -17,6 +17,9 @@ sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o +obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o +sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o + obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o diff --git a/arch/arm64/crypto/sm3-neon-core.S b/arch/arm64/crypto/sm3-neon-core.S new file mode 100644 index 0000000000000..3e3b4e5c736fc --- /dev/null +++ b/arch/arm64/crypto/sm3-neon-core.S @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sm3-neon-core.S - SM3 secure hash using NEON instructions + * + * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64 + * + * Copyright (C) 2021 Jussi Kivilinna + * Copyright (c) 2022 Tianjia Zhang + */ + +#include +#include + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 +#define state_h5 20 +#define state_h6 24 +#define state_h7 28 + +/* Stack structure */ + +#define STACK_W_SIZE (32 * 2 * 3) + +#define STACK_W (0) +#define STACK_SIZE (STACK_W + STACK_W_SIZE) + +/* Register macros */ + +#define RSTATE x0 +#define RDATA x1 +#define RNBLKS x2 +#define RKPTR x28 +#define RFRAME x29 + +#define ra w3 +#define rb w4 +#define rc w5 +#define rd w6 +#define re w7 +#define rf w8 +#define rg w9 +#define rh w10 + +#define t0 w11 +#define t1 w12 +#define t2 w13 +#define t3 w14 +#define t4 w15 +#define t5 w16 +#define t6 w17 + +#define k_even w19 +#define k_odd w20 + +#define addr0 x21 +#define addr1 x22 + +#define s0 w23 +#define s1 w24 +#define s2 w25 +#define s3 w26 + +#define W0 v0 +#define W1 v1 +#define W2 v2 +#define W3 v3 +#define W4 v4 +#define W5 v5 + +#define XTMP0 v6 +#define XTMP1 v7 +#define XTMP2 v16 +#define XTMP3 v17 +#define XTMP4 v18 +#define XTMP5 v19 +#define XTMP6 v20 + +/* Helper macros. */ + +#define _(...) /*_*/ + +#define clear_vec(x) \ + movi x.8h, #0; + +#define rolw(o, a, n) \ + ror o, a, #(32 - n); + +/* Round function macros. */ + +#define GG1_1(x, y, z, o, t) \ + eor o, x, y; +#define GG1_2(x, y, z, o, t) \ + eor o, o, z; +#define GG1_3(x, y, z, o, t) + +#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t) +#define FF1_2(x, y, z, o, t) +#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t) + +#define GG2_1(x, y, z, o, t) \ + bic o, z, x; +#define GG2_2(x, y, z, o, t) \ + and t, y, x; +#define GG2_3(x, y, z, o, t) \ + eor o, o, t; + +#define FF2_1(x, y, z, o, t) \ + eor o, x, y; +#define FF2_2(x, y, z, o, t) \ + and t, x, y; \ + and o, o, z; +#define FF2_3(x, y, z, o, t) \ + eor o, o, t; + +#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + K_LOAD(round); \ + ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \ + rolw(t0, a, 12); /* rol(a, 12) => t0 */ \ + IOP(1, iop_param); \ + FF##i##_1(a, b, c, t1, t2); \ + ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \ + add k, k, e; \ + IOP(2, iop_param); \ + GG##i##_1(e, f, g, t3, t4); \ + FF##i##_2(a, b, c, t1, t2); \ + IOP(3, iop_param); \ + add k, k, t0; \ + add h, h, t5; \ + add d, d, t6; /* w1w2 + d => d */ \ + IOP(4, iop_param); \ + rolw(k, k, 7); /* rol (t0 + e + t), 7) => k */ \ + GG##i##_2(e, f, g, t3, t4); \ + add h, h, k; /* h + w1 + k => h */ \ + IOP(5, iop_param); \ + FF##i##_3(a, b, c, t1, t2); \ + eor t0, t0, k; /* k ^ t0 => t0 */ \ + GG##i##_3(e, f, g, t3, t4); \ + add d, d, t1; /* FF(a,b,c) + d => d */ \ + IOP(6, iop_param); \ + add t3, t3, h; /* GG(e,f,g) + h => t3 */ \ + rolw(b, b, 9); /* rol(b, 9) => b */ \ + eor h, t3, t3, ror #(32-9); \ + IOP(7, iop_param); \ + add d, d, t0; /* t0 + d => d */ \ + rolw(f, f, 19); /* rol(f, 19) => f */ \ + IOP(8, iop_param); \ + eor h, h, t3, ror #(32-17); /* P0(t3) => h */ + +#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) + +#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ + R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) + +#define KL(round) \ + ldp k_even, k_odd, [RKPTR, #(4*(round))]; + +/* Input expansion macros. */ + +/* Byte-swapped input address. */ +#define IW_W_ADDR(round, widx, offs) \ + (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4)) + +/* Expanded input address. */ +#define XW_W_ADDR(round, widx, offs) \ + (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4)) + +/* Rounds 1-12, byte-swapped input block addresses. */ +#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 32) +#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48) + +/* Rounds 1-12, expanded input block addresses. */ +#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) +#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16) + +/* Input block loading. + * Interleaving within round function needed for in-order CPUs. */ +#define LOAD_W_VEC_1_1() \ + add addr0, sp, #IW_W1_ADDR(0, 0); +#define LOAD_W_VEC_1_2() \ + add addr1, sp, #IW_W1_ADDR(4, 0); +#define LOAD_W_VEC_1_3() \ + ld1 {W0.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_4() \ + ld1 {W1.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_5() \ + ld1 {W2.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_6() \ + ld1 {W3.16b}, [RDATA], #16; +#define LOAD_W_VEC_1_7() \ + rev32 XTMP0.16b, W0.16b; +#define LOAD_W_VEC_1_8() \ + rev32 XTMP1.16b, W1.16b; +#define LOAD_W_VEC_2_1() \ + rev32 XTMP2.16b, W2.16b; +#define LOAD_W_VEC_2_2() \ + rev32 XTMP3.16b, W3.16b; +#define LOAD_W_VEC_2_3() \ + eor XTMP4.16b, XTMP1.16b, XTMP0.16b; +#define LOAD_W_VEC_2_4() \ + eor XTMP5.16b, XTMP2.16b, XTMP1.16b; +#define LOAD_W_VEC_2_5() \ + st1 {XTMP0.16b}, [addr0], #16; +#define LOAD_W_VEC_2_6() \ + st1 {XTMP4.16b}, [addr0]; \ + add addr0, sp, #IW_W1_ADDR(8, 0); +#define LOAD_W_VEC_2_7() \ + eor XTMP6.16b, XTMP3.16b, XTMP2.16b; +#define LOAD_W_VEC_2_8() \ + ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */ +#define LOAD_W_VEC_3_1() \ + mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */ +#define LOAD_W_VEC_3_2() \ + st1 {XTMP1.16b}, [addr1], #16; +#define LOAD_W_VEC_3_3() \ + st1 {XTMP5.16b}, [addr1]; \ + ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */ +#define LOAD_W_VEC_3_4() \ + ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */ +#define LOAD_W_VEC_3_5() \ + ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */ +#define LOAD_W_VEC_3_6() \ + st1 {XTMP2.16b}, [addr0], #16; +#define LOAD_W_VEC_3_7() \ + st1 {XTMP6.16b}, [addr0]; +#define LOAD_W_VEC_3_8() \ + ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */ + +#define LOAD_W_VEC_1(iop_num, ...) \ + LOAD_W_VEC_1_##iop_num() +#define LOAD_W_VEC_2(iop_num, ...) \ + LOAD_W_VEC_2_##iop_num() +#define LOAD_W_VEC_3(iop_num, ...) \ + LOAD_W_VEC_3_##iop_num() + +/* Message scheduling. Note: 3 words per vector register. + * Interleaving within round function needed for in-order CPUs. */ +#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 16]) => XTMP0 */ \ + /* Load (w[i - 13]) => XTMP5 */ \ + ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */ +#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP5.16b, w1.16b, w1.16b, #12; +#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */ +#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP5.16b, XTMP5.16b, w2.16b, #12; +#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 9] == w3 */ \ + /* W3 ^ XTMP0 => XTMP0 */ \ + eor XTMP0.16b, XTMP0.16b, w3.16b; +#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 3] == w5 */ \ + /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \ + /* rol(XTMP5, 7) => XTMP1 */ \ + add addr0, sp, #XW_W1_ADDR((round), 0); \ + shl XTMP2.4s, w5.4s, #15; +#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \ + shl XTMP1.4s, XTMP5.4s, #7; +#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP2.4s, w5.4s, #(32-15); +#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP1.4s, XTMP5.4s, #(32-7); +#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \ + eor XTMP0.16b, XTMP0.16b, XTMP2.16b; +#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 6] == W4 */ \ + /* W4 ^ XTMP1 => XTMP1 */ \ + eor XTMP1.16b, XTMP1.16b, w4.16b; +#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \ + /* P1(XTMP0) ^ XTMP1 => W0 */ \ + shl XTMP3.4s, XTMP0.4s, #15; +#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \ + shl XTMP4.4s, XTMP0.4s, #23; +#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, XTMP1.16b, XTMP0.16b; +#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP3.4s, XTMP0.4s, #(32-15); +#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \ + sri XTMP4.4s, XTMP0.4s, #(32-23); +#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, w0.16b, XTMP3.16b; +#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 3]) => XTMP2 */ \ + ext XTMP2.16b, w4.16b, w4.16b, #12; +#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \ + eor w0.16b, w0.16b, XTMP4.16b; +#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \ + ext XTMP2.16b, XTMP2.16b, w5.16b, #12; +#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \ + /* W1 ^ W2 => XTMP3 */ \ + eor XTMP3.16b, XTMP2.16b, w0.16b; +#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5) +#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \ + st1 {XTMP2.16b-XTMP3.16b}, [addr0]; +#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5) + +#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5) +#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5) +#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5) + +#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0) +#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0) +#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0) + +#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1) +#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1) +#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1) + +#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2) +#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2) +#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2) + +#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3) +#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3) +#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3) + +#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \ + SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4) +#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \ + SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4) +#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \ + SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4) + + + /* + * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'. + * + * void sm3_neon_transform(struct sm3_state *sst, u8 const *src, + * int blocks) + */ + .text +.align 3 +SYM_FUNC_START(sm3_neon_transform) + ldp ra, rb, [RSTATE, #0] + ldp rc, rd, [RSTATE, #8] + ldp re, rf, [RSTATE, #16] + ldp rg, rh, [RSTATE, #24] + + stp x28, x29, [sp, #-16]! + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + mov RFRAME, sp + + sub addr0, sp, #STACK_SIZE + adr_l RKPTR, .LKtable + and sp, addr0, #(~63) + + /* Preload first block. */ + LOAD_W_VEC_1(1, 0) + LOAD_W_VEC_1(2, 0) + LOAD_W_VEC_1(3, 0) + LOAD_W_VEC_1(4, 0) + LOAD_W_VEC_1(5, 0) + LOAD_W_VEC_1(6, 0) + LOAD_W_VEC_1(7, 0) + LOAD_W_VEC_1(8, 0) + LOAD_W_VEC_2(1, 0) + LOAD_W_VEC_2(2, 0) + LOAD_W_VEC_2(3, 0) + LOAD_W_VEC_2(4, 0) + LOAD_W_VEC_2(5, 0) + LOAD_W_VEC_2(6, 0) + LOAD_W_VEC_2(7, 0) + LOAD_W_VEC_2(8, 0) + LOAD_W_VEC_3(1, 0) + LOAD_W_VEC_3(2, 0) + LOAD_W_VEC_3(3, 0) + LOAD_W_VEC_3(4, 0) + LOAD_W_VEC_3(5, 0) + LOAD_W_VEC_3(6, 0) + LOAD_W_VEC_3(7, 0) + LOAD_W_VEC_3(8, 0) + +.balign 16 +.Loop: + /* Transform 0-3 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0) + + /* Transform 4-7 + Precalc 12-14 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12) + + /* Transform 8-11 + Precalc 12-17 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15) + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15) + + /* Transform 12-14 + Precalc 18-20 */ + R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18) + R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18) + R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18) + + /* Transform 15-17 + Precalc 21-23 */ + R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21) + + /* Transform 18-20 + Precalc 24-26 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24) + + /* Transform 21-23 + Precalc 27-29 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27) + + /* Transform 24-26 + Precalc 30-32 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30) + + /* Transform 27-29 + Precalc 33-35 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33) + + /* Transform 30-32 + Precalc 36-38 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36) + + /* Transform 33-35 + Precalc 39-41 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39) + + /* Transform 36-38 + Precalc 42-44 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42) + + /* Transform 39-41 + Precalc 45-47 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45) + + /* Transform 42-44 + Precalc 48-50 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48) + + /* Transform 45-47 + Precalc 51-53 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51) + + /* Transform 48-50 + Precalc 54-56 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54) + + /* Transform 51-53 + Precalc 57-59 */ + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57) + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57) + + /* Transform 54-56 + Precalc 60-62 */ + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60) + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60) + + /* Transform 57-59 + Precalc 63 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63) + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63) + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63) + + /* Transform 60 */ + R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _) + subs RNBLKS, RNBLKS, #1 + b.eq .Lend + + /* Transform 61-63 + Preload next block */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _) + ldp s0, s1, [RSTATE, #0] + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _) + ldp s2, s3, [RSTATE, #8] + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _) + + /* Update the chaining variables. */ + eor ra, ra, s0 + eor rb, rb, s1 + ldp s0, s1, [RSTATE, #16] + eor rc, rc, s2 + ldp k_even, k_odd, [RSTATE, #24] + eor rd, rd, s3 + eor re, re, s0 + stp ra, rb, [RSTATE, #0] + eor rf, rf, s1 + stp rc, rd, [RSTATE, #8] + eor rg, rg, k_even + stp re, rf, [RSTATE, #16] + eor rh, rh, k_odd + stp rg, rh, [RSTATE, #24] + b .Loop + +.Lend: + /* Transform 61-63 */ + R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _) + ldp s0, s1, [RSTATE, #0] + R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _) + ldp s2, s3, [RSTATE, #8] + R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _) + + /* Update the chaining variables. */ + eor ra, ra, s0 + clear_vec(W0) + eor rb, rb, s1 + clear_vec(W1) + ldp s0, s1, [RSTATE, #16] + clear_vec(W2) + eor rc, rc, s2 + clear_vec(W3) + ldp k_even, k_odd, [RSTATE, #24] + clear_vec(W4) + eor rd, rd, s3 + clear_vec(W5) + eor re, re, s0 + clear_vec(XTMP0) + stp ra, rb, [RSTATE, #0] + clear_vec(XTMP1) + eor rf, rf, s1 + clear_vec(XTMP2) + stp rc, rd, [RSTATE, #8] + clear_vec(XTMP3) + eor rg, rg, k_even + clear_vec(XTMP4) + stp re, rf, [RSTATE, #16] + clear_vec(XTMP5) + eor rh, rh, k_odd + clear_vec(XTMP6) + stp rg, rh, [RSTATE, #24] + + /* Clear message expansion area */ + add addr0, sp, #STACK_W + st1 {W0.16b-W3.16b}, [addr0], #64 + st1 {W0.16b-W3.16b}, [addr0], #64 + st1 {W0.16b-W3.16b}, [addr0] + + mov sp, RFRAME + + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ldp x28, x29, [sp], #16 + + ret +SYM_FUNC_END(sm3_neon_transform) + + + .section ".rodata", "a" + + .align 4 +.LKtable: + .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb + .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc + .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce + .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 + .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53 + .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d + .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4 + .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c new file mode 100644 index 0000000000000..7182ee683f14a --- /dev/null +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sm3-neon-glue.c - SM3 secure hash using NEON instructions + * + * Copyright (C) 2022 Tianjia Zhang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src, + int blocks); + +static int sm3_neon_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + if (!crypto_simd_usable()) { + sm3_update(shash_desc_ctx(desc), data, len); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_update(desc, data, len, sm3_neon_transform); + kernel_neon_end(); + + return 0; +} + +static int sm3_neon_final(struct shash_desc *desc, u8 *out) +{ + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } + + kernel_neon_begin(); + sm3_base_do_finalize(desc, sm3_neon_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_neon_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + sm3_final(sctx, out); + return 0; + } + + kernel_neon_begin(); + if (len) + sm3_base_do_update(desc, data, len, sm3_neon_transform); + sm3_base_do_finalize(desc, sm3_neon_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static struct shash_alg sm3_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_neon_update, + .final = sm3_neon_final, + .finup = sm3_neon_finup, + .descsize = sizeof(struct sm3_state), + .base.cra_name = "sm3", + .base.cra_driver_name = "sm3-neon", + .base.cra_blocksize = SM3_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}; + +static int __init sm3_neon_init(void) +{ + return crypto_register_shash(&sm3_alg); +} + +static void __exit sm3_neon_fini(void) +{ + crypto_unregister_shash(&sm3_alg); +} + +module_init(sm3_neon_init); +module_exit(sm3_neon_fini); + +MODULE_DESCRIPTION("SM3 secure hash using NEON instructions"); +MODULE_AUTHOR("Jussi Kivilinna "); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_LICENSE("GPL v2"); -- GitLab From 62508017a264133a62987fe40f70c68af9a36572 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:55 +0800 Subject: [PATCH 0360/1423] crypto: arm64/sm4 - refactor and simplify NEON implementation This patch does not add new features. The main work is to refactor and simplify the implementation of SM4 NEON, which is reflected in the following aspects: The accelerated implementation supports the arbitrary number of blocks, not just multiples of 8, which simplifies the implementation and brings some optimization acceleration for data that is not aligned by 8 blocks. When loading the input data, use the ld4 instruction to replace the original ld1 instruction as much as possible, which will save the cost of matrix transposition of the input data. Use 8-block parallelism whenever possible to speed up matrix transpose and rotation operations, instead of up to 4-block parallelism. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-neon-core.S | 630 +++++++++++++++++++----------- arch/arm64/crypto/sm4-neon-glue.c | 172 +++----- 2 files changed, 456 insertions(+), 346 deletions(-) diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S index 3d5256b354d27..f295b4b7d70a3 100644 --- a/arch/arm64/crypto/sm4-neon-core.S +++ b/arch/arm64/crypto/sm4-neon-core.S @@ -18,6 +18,11 @@ #define RTMP2 v10 #define RTMP3 v11 +#define RTMP4 v12 +#define RTMP5 v13 +#define RTMP6 v14 +#define RTMP7 v15 + #define RX0 v12 #define RX1 v13 #define RKEY v14 @@ -25,7 +30,7 @@ /* Helper macros. */ -#define PREPARE \ +#define SM4_PREPARE() \ adr_l x5, crypto_sm4_sbox; \ ld1 {v16.16b-v19.16b}, [x5], #64; \ ld1 {v20.16b-v23.16b}, [x5], #64; \ @@ -42,7 +47,25 @@ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ zip2 s3.2d, RTMP2.2d, RTMP3.2d; -#define rotate_clockwise_90(s0, s1, s2, s3) \ +#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \ + zip1 RTMP0.4s, s0.4s, s1.4s; \ + zip1 RTMP1.4s, s2.4s, s3.4s; \ + zip2 RTMP2.4s, s0.4s, s1.4s; \ + zip2 RTMP3.4s, s2.4s, s3.4s; \ + zip1 RTMP4.4s, s4.4s, s5.4s; \ + zip1 RTMP5.4s, s6.4s, s7.4s; \ + zip2 RTMP6.4s, s4.4s, s5.4s; \ + zip2 RTMP7.4s, s6.4s, s7.4s; \ + zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ + zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ + zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ + zip2 s3.2d, RTMP2.2d, RTMP3.2d; \ + zip1 s4.2d, RTMP4.2d, RTMP5.2d; \ + zip2 s5.2d, RTMP4.2d, RTMP5.2d; \ + zip1 s6.2d, RTMP6.2d, RTMP7.2d; \ + zip2 s7.2d, RTMP6.2d, RTMP7.2d; + +#define rotate_clockwise_4x4(s0, s1, s2, s3) \ zip1 RTMP0.4s, s1.4s, s0.4s; \ zip2 RTMP1.4s, s1.4s, s0.4s; \ zip1 RTMP2.4s, s3.4s, s2.4s; \ @@ -52,6 +75,24 @@ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ zip2 s3.2d, RTMP3.2d, RTMP1.2d; +#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \ + zip1 RTMP0.4s, s1.4s, s0.4s; \ + zip1 RTMP2.4s, s3.4s, s2.4s; \ + zip2 RTMP1.4s, s1.4s, s0.4s; \ + zip2 RTMP3.4s, s3.4s, s2.4s; \ + zip1 RTMP4.4s, s5.4s, s4.4s; \ + zip1 RTMP6.4s, s7.4s, s6.4s; \ + zip2 RTMP5.4s, s5.4s, s4.4s; \ + zip2 RTMP7.4s, s7.4s, s6.4s; \ + zip1 s0.2d, RTMP2.2d, RTMP0.2d; \ + zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ + zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ + zip2 s3.2d, RTMP3.2d, RTMP1.2d; \ + zip1 s4.2d, RTMP6.2d, RTMP4.2d; \ + zip2 s5.2d, RTMP6.2d, RTMP4.2d; \ + zip1 s6.2d, RTMP7.2d, RTMP5.2d; \ + zip2 s7.2d, RTMP7.2d, RTMP5.2d; + #define ROUND4(round, s0, s1, s2, s3) \ dup RX0.4s, RKEY.s[round]; \ /* rk ^ s1 ^ s2 ^ s3 */ \ @@ -87,14 +128,7 @@ /* s0 ^= RTMP3 */ \ eor s0.16b, s0.16b, RTMP3.16b; -#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ - rev32 b0.16b, b0.16b; \ - rev32 b1.16b, b1.16b; \ - rev32 b2.16b, b2.16b; \ - rev32 b3.16b, b3.16b; \ - \ - transpose_4x4(b0, b1, b2, b3); \ - \ +#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \ mov x6, 8; \ 4: \ ld1 {RKEY.4s}, [x0], #16; \ @@ -107,15 +141,23 @@ \ bne 4b; \ \ - rotate_clockwise_90(b0, b1, b2, b3); \ rev32 b0.16b, b0.16b; \ rev32 b1.16b, b1.16b; \ rev32 b2.16b, b2.16b; \ rev32 b3.16b, b3.16b; \ \ + rotate_clockwise_4x4(b0, b1, b2, b3); \ + \ /* repoint to rkey */ \ sub x0, x0, #128; +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + SM4_CRYPT_BLK4_BE(b0, b1, b2, b3); + #define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \ /* rk ^ s1 ^ s2 ^ s3 */ \ dup RX0.4s, RKEY.s[round]; \ @@ -175,7 +217,7 @@ eor s0.16b, s0.16b, RTMP0.16b; \ eor t0.16b, t0.16b, RTMP1.16b; -#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ +#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \ rev32 b0.16b, b0.16b; \ rev32 b1.16b, b1.16b; \ rev32 b2.16b, b2.16b; \ @@ -185,9 +227,6 @@ rev32 b6.16b, b6.16b; \ rev32 b7.16b, b7.16b; \ \ - transpose_4x4(b0, b1, b2, b3); \ - transpose_4x4(b4, b5, b6, b7); \ - \ mov x6, 8; \ 8: \ ld1 {RKEY.4s}, [x0], #16; \ @@ -200,8 +239,6 @@ \ bne 8b; \ \ - rotate_clockwise_90(b0, b1, b2, b3); \ - rotate_clockwise_90(b4, b5, b6, b7); \ rev32 b0.16b, b0.16b; \ rev32 b1.16b, b1.16b; \ rev32 b2.16b, b2.16b; \ @@ -214,274 +251,429 @@ /* repoint to rkey */ \ sub x0, x0, #128; +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \ + rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7); \ -.align 3 -SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4) - /* input: - * x0: round key array, CTX - * x1: dst - * x2: src - * w3: num blocks (1..4) - */ - PREPARE; - - ld1 {v0.16b}, [x2], #16; - mov v1.16b, v0.16b; - mov v2.16b, v0.16b; - mov v3.16b, v0.16b; - cmp w3, #2; - blt .Lblk4_load_input_done; - ld1 {v1.16b}, [x2], #16; - beq .Lblk4_load_input_done; - ld1 {v2.16b}, [x2], #16; - cmp w3, #3; - beq .Lblk4_load_input_done; - ld1 {v3.16b}, [x2]; - -.Lblk4_load_input_done: - SM4_CRYPT_BLK4(v0, v1, v2, v3); - - st1 {v0.16b}, [x1], #16; - cmp w3, #2; - blt .Lblk4_store_output_done; - st1 {v1.16b}, [x1], #16; - beq .Lblk4_store_output_done; - st1 {v2.16b}, [x1], #16; - cmp w3, #3; - beq .Lblk4_store_output_done; - st1 {v3.16b}, [x1]; - -.Lblk4_store_output_done: - ret; -SYM_FUNC_END(__sm4_neon_crypt_blk1_4) .align 3 -SYM_FUNC_START(sm4_neon_crypt_blk1_8) +SYM_FUNC_START(sm4_neon_crypt) /* input: * x0: round key array, CTX * x1: dst * x2: src - * w3: num blocks (1..8) + * w3: nblocks */ - cmp w3, #5; - blt __sm4_neon_crypt_blk1_4; - - PREPARE; - - ld1 {v0.16b-v3.16b}, [x2], #64; - ld1 {v4.16b}, [x2], #16; - mov v5.16b, v4.16b; - mov v6.16b, v4.16b; - mov v7.16b, v4.16b; - beq .Lblk8_load_input_done; - ld1 {v5.16b}, [x2], #16; - cmp w3, #7; - blt .Lblk8_load_input_done; - ld1 {v6.16b}, [x2], #16; - beq .Lblk8_load_input_done; - ld1 {v7.16b}, [x2]; - -.Lblk8_load_input_done: - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); - - cmp w3, #6; - st1 {v0.16b-v3.16b}, [x1], #64; - st1 {v4.16b}, [x1], #16; - blt .Lblk8_store_output_done; - st1 {v5.16b}, [x1], #16; - beq .Lblk8_store_output_done; - st1 {v6.16b}, [x1], #16; - cmp w3, #7; - beq .Lblk8_store_output_done; - st1 {v7.16b}, [x1]; - -.Lblk8_store_output_done: - ret; -SYM_FUNC_END(sm4_neon_crypt_blk1_8) + SM4_PREPARE() -.align 3 -SYM_FUNC_START(sm4_neon_crypt_blk8) - /* input: - * x0: round key array, CTX - * x1: dst - * x2: src - * w3: nblocks (multiples of 8) - */ - PREPARE; +.Lcrypt_loop_8x: + sub w3, w3, #8 + tbnz w3, #31, .Lcrypt_4x + + ld4 {v0.4s-v3.4s}, [x2], #64 + ld4 {v4.4s-v7.4s}, [x2], #64 -.Lcrypt_loop_blk: - subs w3, w3, #8; - bmi .Lcrypt_end; + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) - ld1 {v0.16b-v3.16b}, [x2], #64; - ld1 {v4.16b-v7.16b}, [x2], #64; + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + cbz w3, .Lcrypt_end + b .Lcrypt_loop_8x - st1 {v0.16b-v3.16b}, [x1], #64; - st1 {v4.16b-v7.16b}, [x1], #64; +.Lcrypt_4x: + add w3, w3, #8 + cmp w3, #4 + blt .Lcrypt_tail - b .Lcrypt_loop_blk; + sub w3, w3, #4 + + ld4 {v0.4s-v3.4s}, [x2], #64 + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + st1 {v0.16b-v3.16b}, [x1], #64 + + cbz w3, .Lcrypt_end + +.Lcrypt_tail: + cmp w3, #2 + ld1 {v0.16b}, [x2], #16 + blt .Lcrypt_tail_load_done + ld1 {v1.16b}, [x2], #16 + beq .Lcrypt_tail_load_done + ld1 {v2.16b}, [x2], #16 + +.Lcrypt_tail_load_done: + transpose_4x4(v0, v1, v2, v3) + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + cmp w3, #2 + st1 {v0.16b}, [x1], #16 + blt .Lcrypt_end + st1 {v1.16b}, [x1], #16 + beq .Lcrypt_end + st1 {v2.16b}, [x1], #16 .Lcrypt_end: - ret; -SYM_FUNC_END(sm4_neon_crypt_blk8) + ret +SYM_FUNC_END(sm4_neon_crypt) .align 3 -SYM_FUNC_START(sm4_neon_cbc_dec_blk8) +SYM_FUNC_START(sm4_neon_cbc_dec) /* input: * x0: round key array, CTX * x1: dst * x2: src * x3: iv (big endian, 128 bit) - * w4: nblocks (multiples of 8) + * w4: nblocks */ - PREPARE; + SM4_PREPARE() + + ld1 {RIV.16b}, [x3] + +.Lcbc_dec_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lcbc_dec_4x + + ld4 {v0.4s-v3.4s}, [x2], #64 + ld4 {v4.4s-v7.4s}, [x2] + + SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7) + + /* Avoid overwriting the RIV register */ + rotate_clockwise_4x4(v0, v1, v2, v3) + rotate_clockwise_4x4(v4, v5, v6, v7) + + sub x2, x2, #64 + + eor v0.16b, v0.16b, RIV.16b - ld1 {RIV.16b}, [x3]; + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 -.Lcbc_loop_blk: - subs w4, w4, #8; - bmi .Lcbc_end; + eor v1.16b, v1.16b, RTMP0.16b + eor v2.16b, v2.16b, RTMP1.16b + eor v3.16b, v3.16b, RTMP2.16b + eor v4.16b, v4.16b, RTMP3.16b + eor v5.16b, v5.16b, RTMP4.16b + eor v6.16b, v6.16b, RTMP5.16b + eor v7.16b, v7.16b, RTMP6.16b - ld1 {v0.16b-v3.16b}, [x2], #64; - ld1 {v4.16b-v7.16b}, [x2]; + mov RIV.16b, RTMP7.16b - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 - sub x2, x2, #64; - eor v0.16b, v0.16b, RIV.16b; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v1.16b, v1.16b, RTMP0.16b; - eor v2.16b, v2.16b, RTMP1.16b; - eor v3.16b, v3.16b, RTMP2.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + cbz w4, .Lcbc_dec_end + b .Lcbc_dec_loop_8x - eor v4.16b, v4.16b, RTMP3.16b; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v5.16b, v5.16b, RTMP0.16b; - eor v6.16b, v6.16b, RTMP1.16b; - eor v7.16b, v7.16b, RTMP2.16b; +.Lcbc_dec_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lcbc_dec_tail - mov RIV.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; + sub w4, w4, #4 - b .Lcbc_loop_blk; + ld1 {v0.16b-v3.16b}, [x2], #64 -.Lcbc_end: + rev32 v4.16b, v0.16b + rev32 v5.16b, v1.16b + rev32 v6.16b, v2.16b + rev32 v7.16b, v3.16b + + transpose_4x4(v4, v5, v6, v7) + + SM4_CRYPT_BLK4_BE(v4, v5, v6, v7) + + eor v4.16b, v4.16b, RIV.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v1.16b + eor v7.16b, v7.16b, v2.16b + + mov RIV.16b, v3.16b + + st1 {v4.16b-v7.16b}, [x1], #64 + + cbz w4, .Lcbc_dec_end + +.Lcbc_dec_tail: + cmp w4, #2 + ld1 {v0.16b}, [x2], #16 + blt .Lcbc_dec_tail_load_done + ld1 {v1.16b}, [x2], #16 + beq .Lcbc_dec_tail_load_done + ld1 {v2.16b}, [x2], #16 + +.Lcbc_dec_tail_load_done: + rev32 v4.16b, v0.16b + rev32 v5.16b, v1.16b + rev32 v6.16b, v2.16b + + transpose_4x4(v4, v5, v6, v7) + + SM4_CRYPT_BLK4_BE(v4, v5, v6, v7) + + cmp w4, #2 + eor v4.16b, v4.16b, RIV.16b + mov RIV.16b, v0.16b + st1 {v4.16b}, [x1], #16 + blt .Lcbc_dec_end + + eor v5.16b, v5.16b, v0.16b + mov RIV.16b, v1.16b + st1 {v5.16b}, [x1], #16 + beq .Lcbc_dec_end + + eor v6.16b, v6.16b, v1.16b + mov RIV.16b, v2.16b + st1 {v6.16b}, [x1], #16 + +.Lcbc_dec_end: /* store new IV */ - st1 {RIV.16b}, [x3]; + st1 {RIV.16b}, [x3] - ret; -SYM_FUNC_END(sm4_neon_cbc_dec_blk8) + ret +SYM_FUNC_END(sm4_neon_cbc_dec) .align 3 -SYM_FUNC_START(sm4_neon_cfb_dec_blk8) +SYM_FUNC_START(sm4_neon_cfb_dec) /* input: * x0: round key array, CTX * x1: dst * x2: src * x3: iv (big endian, 128 bit) - * w4: nblocks (multiples of 8) + * w4: nblocks */ - PREPARE; + SM4_PREPARE() + + ld1 {v0.16b}, [x3] + +.Lcfb_dec_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lcfb_dec_4x + + ld1 {v1.16b-v3.16b}, [x2], #48 + ld4 {v4.4s-v7.4s}, [x2] + + transpose_4x4(v0, v1, v2, v3) + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) + + sub x2, x2, #48 + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 + + eor v0.16b, v0.16b, RTMP0.16b + eor v1.16b, v1.16b, RTMP1.16b + eor v2.16b, v2.16b, RTMP2.16b + eor v3.16b, v3.16b, RTMP3.16b + eor v4.16b, v4.16b, RTMP4.16b + eor v5.16b, v5.16b, RTMP5.16b + eor v6.16b, v6.16b, RTMP6.16b + eor v7.16b, v7.16b, RTMP7.16b + + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 + + mov v0.16b, RTMP7.16b + + cbz w4, .Lcfb_dec_end + b .Lcfb_dec_loop_8x + +.Lcfb_dec_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lcfb_dec_tail + + sub w4, w4, #4 + + ld1 {v4.16b-v7.16b}, [x2], #64 + + rev32 v0.16b, v0.16b /* v0 is IV register */ + rev32 v1.16b, v4.16b + rev32 v2.16b, v5.16b + rev32 v3.16b, v6.16b + + transpose_4x4(v0, v1, v2, v3) + + SM4_CRYPT_BLK4_BE(v0, v1, v2, v3) - ld1 {v0.16b}, [x3]; + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b -.Lcfb_loop_blk: - subs w4, w4, #8; - bmi .Lcfb_end; + st1 {v0.16b-v3.16b}, [x1], #64 - ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; - ld1 {v4.16b-v7.16b}, [x2]; + mov v0.16b, v7.16b - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + cbz w4, .Lcfb_dec_end - sub x2, x2, #48; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; +.Lcfb_dec_tail: + cmp w4, #2 + ld1 {v4.16b}, [x2], #16 + blt .Lcfb_dec_tail_load_done + ld1 {v5.16b}, [x2], #16 + beq .Lcfb_dec_tail_load_done + ld1 {v6.16b}, [x2], #16 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v4.16b, v4.16b, RTMP0.16b; - eor v5.16b, v5.16b, RTMP1.16b; - eor v6.16b, v6.16b, RTMP2.16b; - eor v7.16b, v7.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; +.Lcfb_dec_tail_load_done: + rev32 v0.16b, v0.16b /* v0 is IV register */ + rev32 v1.16b, v4.16b + rev32 v2.16b, v5.16b - mov v0.16b, RTMP3.16b; + transpose_4x4(v0, v1, v2, v3) - b .Lcfb_loop_blk; + SM4_CRYPT_BLK4_BE(v0, v1, v2, v3) -.Lcfb_end: + cmp w4, #2 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x1], #16 + mov v0.16b, v4.16b + blt .Lcfb_dec_end + + eor v1.16b, v1.16b, v5.16b + st1 {v1.16b}, [x1], #16 + mov v0.16b, v5.16b + beq .Lcfb_dec_end + + eor v2.16b, v2.16b, v6.16b + st1 {v2.16b}, [x1], #16 + mov v0.16b, v6.16b + +.Lcfb_dec_end: /* store new IV */ - st1 {v0.16b}, [x3]; + st1 {v0.16b}, [x3] - ret; -SYM_FUNC_END(sm4_neon_cfb_dec_blk8) + ret +SYM_FUNC_END(sm4_neon_cfb_dec) .align 3 -SYM_FUNC_START(sm4_neon_ctr_enc_blk8) +SYM_FUNC_START(sm4_neon_ctr_crypt) /* input: * x0: round key array, CTX * x1: dst * x2: src * x3: ctr (big endian, 128 bit) - * w4: nblocks (multiples of 8) + * w4: nblocks */ - PREPARE; + SM4_PREPARE() - ldp x7, x8, [x3]; - rev x7, x7; - rev x8, x8; + ldp x7, x8, [x3] + rev x7, x7 + rev x8, x8 -.Lctr_loop_blk: - subs w4, w4, #8; - bmi .Lctr_end; +.Lctr_crypt_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lctr_crypt_4x -#define inc_le128(vctr) \ - mov vctr.d[1], x8; \ - mov vctr.d[0], x7; \ - adds x8, x8, #1; \ - adc x7, x7, xzr; \ - rev64 vctr.16b, vctr.16b; +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + rev64 vctr.16b, vctr.16b; \ + adc x7, x7, xzr; /* construct CTRs */ - inc_le128(v0); /* +0 */ - inc_le128(v1); /* +1 */ - inc_le128(v2); /* +2 */ - inc_le128(v3); /* +3 */ - inc_le128(v4); /* +4 */ - inc_le128(v5); /* +5 */ - inc_le128(v6); /* +6 */ - inc_le128(v7); /* +7 */ - - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); - - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; - - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v4.16b, v4.16b, RTMP0.16b; - eor v5.16b, v5.16b, RTMP1.16b; - eor v6.16b, v6.16b, RTMP2.16b; - eor v7.16b, v7.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; - - b .Lctr_loop_blk; - -.Lctr_end: + inc_le128(v0) /* +0 */ + inc_le128(v1) /* +1 */ + inc_le128(v2) /* +2 */ + inc_le128(v3) /* +3 */ + inc_le128(v4) /* +4 */ + inc_le128(v5) /* +5 */ + inc_le128(v6) /* +6 */ + inc_le128(v7) /* +7 */ + + transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7) + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 + + eor v0.16b, v0.16b, RTMP0.16b + eor v1.16b, v1.16b, RTMP1.16b + eor v2.16b, v2.16b, RTMP2.16b + eor v3.16b, v3.16b, RTMP3.16b + eor v4.16b, v4.16b, RTMP4.16b + eor v5.16b, v5.16b, RTMP5.16b + eor v6.16b, v6.16b, RTMP6.16b + eor v7.16b, v7.16b, RTMP7.16b + + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 + + cbz w4, .Lctr_crypt_end + b .Lctr_crypt_loop_8x + +.Lctr_crypt_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lctr_crypt_tail + + sub w4, w4, #4 + + /* construct CTRs */ + inc_le128(v0) /* +0 */ + inc_le128(v1) /* +1 */ + inc_le128(v2) /* +2 */ + inc_le128(v3) /* +3 */ + + ld1 {v4.16b-v7.16b}, [x2], #64 + + transpose_4x4(v0, v1, v2, v3) + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1], #64 + + cbz w4, .Lctr_crypt_end + +.Lctr_crypt_tail: + /* inc_le128 will change the sign bit */ + ld1 {v4.16b}, [x2], #16 + inc_le128(v0) + cmp w4, #2 + blt .Lctr_crypt_tail_load_done + + ld1 {v5.16b}, [x2], #16 + inc_le128(v1) + cmp w4, #2 + beq .Lctr_crypt_tail_load_done + + ld1 {v6.16b}, [x2], #16 + inc_le128(v2) + +.Lctr_crypt_tail_load_done: + transpose_4x4(v0, v1, v2, v3) + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + cmp w4, #2 + + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x1], #16 + blt .Lctr_crypt_end + + eor v1.16b, v1.16b, v5.16b + st1 {v1.16b}, [x1], #16 + beq .Lctr_crypt_end + + eor v2.16b, v2.16b, v6.16b + st1 {v2.16b}, [x1], #16 + +.Lctr_crypt_end: /* store new CTR */ - rev x7, x7; - rev x8, x8; - stp x7, x8, [x3]; + rev x7, x7 + rev x8, x8 + stp x7, x8, [x3] - ret; -SYM_FUNC_END(sm4_neon_ctr_enc_blk8) + ret +SYM_FUNC_END(sm4_neon_ctr_crypt) diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c index 03a6a6866a311..7b19accf5c037 100644 --- a/arch/arm64/crypto/sm4-neon-glue.c +++ b/arch/arm64/crypto/sm4-neon-glue.c @@ -18,19 +18,14 @@ #include #include -#define BYTES2BLKS(nbytes) ((nbytes) >> 4) -#define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1)) - -asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src, - unsigned int nblks); -asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src, - unsigned int nblks); -asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, - u8 *iv, unsigned int nblks); -asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, - u8 *iv, unsigned int nblks); -asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src, - u8 *iv, unsigned int nblks); +asmlinkage void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src, + unsigned int nblocks); +asmlinkage void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblocks); +asmlinkage void sm4_neon_cfb_dec(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblocks); +asmlinkage void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblocks); static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) @@ -51,27 +46,18 @@ static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - unsigned int nblks; + unsigned int nblocks; - kernel_neon_begin(); + nblocks = nbytes / SM4_BLOCK_SIZE; + if (nblocks) { + kernel_neon_begin(); - nblks = BYTES2BLK8(nbytes); - if (nblks) { - sm4_neon_crypt_blk8(rkey, dst, src, nblks); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + sm4_neon_crypt(rkey, dst, src, nblocks); - nblks = BYTES2BLKS(nbytes); - if (nblks) { - sm4_neon_crypt_blk1_8(rkey, dst, src, nblks); - nbytes -= nblks * SM4_BLOCK_SIZE; + kernel_neon_end(); } - kernel_neon_end(); - - err = skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); } return err; @@ -138,48 +124,19 @@ static int sm4_cbc_decrypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - unsigned int nblks; + unsigned int nblocks; - kernel_neon_begin(); + nblocks = nbytes / SM4_BLOCK_SIZE; + if (nblocks) { + kernel_neon_begin(); - nblks = BYTES2BLK8(nbytes); - if (nblks) { - sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src, - walk.iv, nblks); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + sm4_neon_cbc_dec(ctx->rkey_dec, dst, src, + walk.iv, nblocks); - nblks = BYTES2BLKS(nbytes); - if (nblks) { - u8 keystream[SM4_BLOCK_SIZE * 8]; - u8 iv[SM4_BLOCK_SIZE]; - int i; - - sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream, - src, nblks); - - src += ((int)nblks - 2) * SM4_BLOCK_SIZE; - dst += (nblks - 1) * SM4_BLOCK_SIZE; - memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); - - for (i = nblks - 1; i > 0; i--) { - crypto_xor_cpy(dst, src, - &keystream[i * SM4_BLOCK_SIZE], - SM4_BLOCK_SIZE); - src -= SM4_BLOCK_SIZE; - dst -= SM4_BLOCK_SIZE; - } - crypto_xor_cpy(dst, walk.iv, - keystream, SM4_BLOCK_SIZE); - memcpy(walk.iv, iv, SM4_BLOCK_SIZE); - nbytes -= nblks * SM4_BLOCK_SIZE; + kernel_neon_end(); } - kernel_neon_end(); - - err = skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); } return err; @@ -238,41 +195,21 @@ static int sm4_cfb_decrypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - unsigned int nblks; + unsigned int nblocks; - kernel_neon_begin(); + nblocks = nbytes / SM4_BLOCK_SIZE; + if (nblocks) { + kernel_neon_begin(); - nblks = BYTES2BLK8(nbytes); - if (nblks) { - sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src, - walk.iv, nblks); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + sm4_neon_cfb_dec(ctx->rkey_enc, dst, src, + walk.iv, nblocks); - nblks = BYTES2BLKS(nbytes); - if (nblks) { - u8 keystream[SM4_BLOCK_SIZE * 8]; - - memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); - if (nblks > 1) - memcpy(&keystream[SM4_BLOCK_SIZE], src, - (nblks - 1) * SM4_BLOCK_SIZE); - memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE, - SM4_BLOCK_SIZE); - - sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, - keystream, nblks); - - crypto_xor_cpy(dst, src, keystream, - nblks * SM4_BLOCK_SIZE); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + kernel_neon_end(); - kernel_neon_end(); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } /* tail */ if (walk.nbytes == walk.total && nbytes > 0) { @@ -302,40 +239,21 @@ static int sm4_ctr_crypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - unsigned int nblks; + unsigned int nblocks; - kernel_neon_begin(); + nblocks = nbytes / SM4_BLOCK_SIZE; + if (nblocks) { + kernel_neon_begin(); - nblks = BYTES2BLK8(nbytes); - if (nblks) { - sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src, - walk.iv, nblks); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + sm4_neon_ctr_crypt(ctx->rkey_enc, dst, src, + walk.iv, nblocks); - nblks = BYTES2BLKS(nbytes); - if (nblks) { - u8 keystream[SM4_BLOCK_SIZE * 8]; - int i; - - for (i = 0; i < nblks; i++) { - memcpy(&keystream[i * SM4_BLOCK_SIZE], - walk.iv, SM4_BLOCK_SIZE); - crypto_inc(walk.iv, SM4_BLOCK_SIZE); - } - sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, - keystream, nblks); - - crypto_xor_cpy(dst, src, keystream, - nblks * SM4_BLOCK_SIZE); - dst += nblks * SM4_BLOCK_SIZE; - src += nblks * SM4_BLOCK_SIZE; - nbytes -= nblks * SM4_BLOCK_SIZE; - } + kernel_neon_end(); - kernel_neon_end(); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } /* tail */ if (walk.nbytes == walk.total && nbytes > 0) { -- GitLab From c24ee936c79d7c381750f6c23bbef1257850279f Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:56 +0800 Subject: [PATCH 0361/1423] crypto: testmgr - add SM4 cts-cbc/xts/xcbc test vectors This patch newly adds the test vectors of CTS-CBC/XTS/XCBC modes of the SM4 algorithm, and also added some test vectors for SM4 GCM/CCM. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- crypto/testmgr.c | 19 + crypto/testmgr.h | 977 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 996 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index bcd059caa1c81..e2806ef044fd4 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4712,6 +4712,12 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "cts(cbc(paes))", .test = alg_test_null, .fips_allowed = 1, + }, { + .alg = "cts(cbc(sm4))", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(sm4_cts_tv_template) + } }, { .alg = "curve25519", .test = alg_test_kpp, @@ -5586,6 +5592,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(aes_xcbc128_tv_template) } + }, { + .alg = "xcbc(sm4)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(sm4_xcbc128_tv_template) + } }, { .alg = "xchacha12", .test = alg_test_skcipher, @@ -5640,6 +5652,13 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .cipher = __VECS(serpent_xts_tv_template) } + }, { + .alg = "xts(sm4)", + .generic_driver = "xts(ecb(sm4-generic))", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(sm4_xts_tv_template) + } }, { .alg = "xts(twofish)", .generic_driver = "xts(ecb(twofish-generic))", diff --git a/crypto/testmgr.h b/crypto/testmgr.h index d6088e26f3261..f10bfb9d99735 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -14882,6 +14882,353 @@ static const struct cipher_testvec sm4_cfb_tv_template[] = { } }; +static const struct cipher_testvec sm4_cts_tv_template[] = { + /* Generated from AES-CTS test vectors */ + { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20", + .len = 17, + .ctext = "\x05\xfe\x23\xee\x17\xa2\x89\x98" + "\xbc\x97\x0a\x0b\x54\x67\xca\xd7" + "\xd6", + }, { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20", + .len = 31, + .ctext = "\x15\x46\xe4\x95\xa4\xec\xf0\xb8" + "\x49\xd6\x6a\x9d\x89\xc7\xfd\x70" + "\xd6\x71\xc8\xc0\x4d\x52\x7c\x66" + "\x93\xf7\x70\xbb\xa8\x3f\xa3", + }, { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20\x43", + .len = 32, + .ctext = "\x89\xc7\x99\x3f\x87\x69\x5c\xd3" + "\x01\x6a\xbf\xd4\x3f\x79\x02\xa3" + "\xd6\x71\xc8\xc0\x4d\x52\x7c\x66" + "\x93\xf7\x70\xbb\xa8\x3f\xa3\xcf", + }, { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20\x43" + "\x68\x69\x63\x6b\x65\x6e\x2c\x20" + "\x70\x6c\x65\x61\x73\x65\x2c", + .len = 47, + .ctext = "\xd6\x71\xc8\xc0\x4d\x52\x7c\x66" + "\x93\xf7\x70\xbb\xa8\x3f\xa3\xcf" + "\xd3\xe1\xdc\xeb\xfa\x04\x11\x99" + "\xde\xcf\x6f\x4d\x7b\x09\x92\x7f" + "\x89\xc7\x99\x3f\x87\x69\x5c\xd3" + "\x01\x6a\xbf\xd4\x3f\x79\x02", + }, { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20\x43" + "\x68\x69\x63\x6b\x65\x6e\x2c\x20" + "\x70\x6c\x65\x61\x73\x65\x2c\x20", + .len = 48, + .ctext = "\xd6\x71\xc8\xc0\x4d\x52\x7c\x66" + "\x93\xf7\x70\xbb\xa8\x3f\xa3\xcf" + "\x9a\xbd\x7b\xfe\x82\xab\xcc\x7f" + "\xbd\x99\x21\x0c\x5e\x4d\xed\x20" + "\x89\xc7\x99\x3f\x87\x69\x5c\xd3" + "\x01\x6a\xbf\xd4\x3f\x79\x02\xa3", + }, { + .klen = 16, + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .ptext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20\x43" + "\x68\x69\x63\x6b\x65\x6e\x2c\x20" + "\x70\x6c\x65\x61\x73\x65\x2c\x20" + "\x61\x6e\x64\x20\x77\x6f\x6e\x74" + "\x6f\x6e\x20\x73\x6f\x75\x70\x2e", + .len = 64, + .ctext = "\xd6\x71\xc8\xc0\x4d\x52\x7c\x66" + "\x93\xf7\x70\xbb\xa8\x3f\xa3\xcf" + "\x89\xc7\x99\x3f\x87\x69\x5c\xd3" + "\x01\x6a\xbf\xd4\x3f\x79\x02\xa3" + "\x58\x19\xa4\x8f\xa9\x68\x5e\x6b" + "\x2c\x0f\x81\x60\x15\x98\x27\x4f" + "\x9a\xbd\x7b\xfe\x82\xab\xcc\x7f" + "\xbd\x99\x21\x0c\x5e\x4d\xed\x20", + } +}; + +static const struct cipher_testvec sm4_xts_tv_template[] = { + /* Generated from AES-XTS test vectors */ + { + .key = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ctext = "\xd9\xb4\x21\xf7\x31\xc8\x94\xfd" + "\xc3\x5b\x77\x29\x1f\xe4\xe3\xb0" + "\x2a\x1f\xb7\x66\x98\xd5\x9f\x0e" + "\x51\x37\x6c\x4a\xda\x5b\xc7\x5d", + .len = 32, + }, { + .key = "\x11\x11\x11\x11\x11\x11\x11\x11" + "\x11\x11\x11\x11\x11\x11\x11\x11" + "\x22\x22\x22\x22\x22\x22\x22\x22" + "\x22\x22\x22\x22\x22\x22\x22\x22", + .klen = 32, + .iv = "\x33\x33\x33\x33\x33\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44", + .ctext = "\xa7\x4d\x72\x6c\x11\x19\x6a\x32" + "\xbe\x04\xe0\x01\xff\x29\xd0\xc7" + "\x93\x2f\x9f\x3e\xc2\x9b\xfc\xb6" + "\x4d\xd1\x7f\x63\xcb\xd3\xea\x31", + .len = 32, + }, { + .key = "\xff\xfe\xfd\xfc\xfb\xfa\xf9\xf8" + "\xf7\xf6\xf5\xf4\xf3\xf2\xf1\xf0" + "\x22\x22\x22\x22\x22\x22\x22\x22" + "\x22\x22\x22\x22\x22\x22\x22\x22", + .klen = 32, + .iv = "\x33\x33\x33\x33\x33\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44" + "\x44\x44\x44\x44\x44\x44\x44\x44", + .ctext = "\x7f\x76\x08\x8e\xff\xad\xf7\x0c" + "\x02\xea\x9f\x95\xda\x06\x28\xd3" + "\x51\xbf\xcb\x9e\xac\x05\x63\xbc" + "\xf1\x7b\x71\x0d\xab\x0a\x98\x26", + .len = 32, + }, { + .key = "\x27\x18\x28\x18\x28\x45\x90\x45" + "\x23\x53\x60\x28\x74\x71\x35\x26" + "\x31\x41\x59\x26\x53\x58\x97\x93" + "\x23\x84\x62\x64\x33\x83\x27\x95", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .ctext = "\x54\xdd\x65\xb6\x32\x6f\xae\xa8" + "\xfa\xd1\xa8\x3c\x63\x61\x4a\xf3" + "\x9f\x72\x1d\x8d\xfe\x17\x7a\x30" + "\xb6\x6a\xbf\x6a\x44\x99\x80\xe1" + "\xcd\xbe\x06\xaf\xb7\x33\x36\xf3" + "\x7a\x4d\x39\xde\x96\x4a\x30\xd7" + "\xd0\x4a\x37\x99\x16\x9c\x60\x25" + "\x8f\x6b\x74\x8a\x61\x86\x1a\xa5" + "\xec\x92\xa2\xc1\x5b\x2b\x7c\x61" + "\x5a\x42\xab\xa4\x99\xbb\xd6\xb7" + "\x1d\xb9\xc7\x89\xb2\x18\x20\x89" + "\xa2\x5d\xd3\xdf\x80\x0e\xd1\x86" + "\x4d\x19\xf7\xed\x45\xfd\x17\xa9" + "\x48\x0b\x0f\xb8\x2d\x9b\x7f\xc3" + "\xed\x57\xe9\xa1\x14\x0e\xaa\x77" + "\x8d\xd2\xdd\x67\x9e\x3e\xdc\x3d" + "\xc4\xd5\x5c\x95\x0e\xbc\x53\x1d" + "\x95\x92\xf7\xc4\x63\x82\x56\xd5" + "\x65\x18\x29\x2a\x20\xaf\x98\xfd" + "\xd3\xa6\x36\x00\x35\x0a\x70\xab" + "\x5a\x40\xf4\xc2\x85\x03\x7c\xa0" + "\x1f\x25\x1f\x19\xec\xae\x03\x29" + "\xff\x77\xad\x88\xcd\x5a\x4c\xde" + "\xa2\xae\xab\xc2\x21\x48\xff\xbd" + "\x23\x9b\xd1\x05\x15\xbd\xe1\x13" + "\x1d\xec\x84\x04\xe4\x43\xdc\x76" + "\x31\x40\xd5\xf2\x2b\xf3\x3e\x0c" + "\x68\x72\xd6\xb8\x1d\x63\x0f\x6f" + "\x00\xcd\xd0\x58\xfe\x80\xf9\xcb" + "\xfb\x77\x70\x7f\x93\xce\xe2\xca" + "\x92\xb9\x15\xb8\x30\x40\x27\xc1" + "\x90\xa8\x4e\x2d\x65\xe0\x18\xcc" + "\x6a\x38\x7d\x37\x66\xac\xdb\x28" + "\x25\x32\x84\xe8\xdb\x9a\xcf\x8f" + "\x52\x28\x0d\xdc\x6d\x00\x33\xd2" + "\xcc\xaa\xa4\xf9\xae\xff\x12\x36" + "\x69\xbc\x02\x4f\xd6\x76\x8e\xdf" + "\x8b\xc1\xf8\xd6\x22\xc1\x9c\x60" + "\x9e\xf9\x7f\x60\x91\x90\xcd\x11" + "\x02\x41\xe7\xfb\x08\x4e\xd8\x94" + "\x2d\xa1\xf9\xb9\xcf\x1b\x51\x4b" + "\x61\xa3\x88\xb3\x0e\xa6\x1a\x4a" + "\x74\x5b\x38\x1e\xe7\xad\x6c\x4d" + "\xb1\x27\x54\x53\xb8\x41\x3f\x98" + "\xdf\x6e\x4a\x40\x98\x6e\xe4\xb5" + "\x9a\xf5\xdf\xae\xcd\x30\x12\x65" + "\x17\x90\x67\xa0\x0d\x7c\xa3\x5a" + "\xb9\x5a\xbd\x61\x7a\xde\xa2\x8e" + "\xc1\xc2\x6a\x97\xde\x28\xb8\xbf" + "\xe3\x01\x20\xd6\xae\xfb\xd2\x58" + "\xc5\x9e\x42\xd1\x61\xe8\x06\x5a" + "\x78\x10\x6b\xdc\xa5\xcd\x90\xfb" + "\x3a\xac\x4e\x93\x86\x6c\x8a\x7f" + "\x96\x76\x86\x0a\x79\x14\x5b\xd9" + "\x2e\x02\xe8\x19\xa9\x0b\xe0\xb9" + "\x7c\xc5\x22\xb3\x21\x06\x85\x6f" + "\xdf\x0e\x54\xd8\x8e\x46\x24\x15" + "\x5a\x2f\x1c\x14\xea\xea\xa1\x63" + "\xf8\x58\xe9\x9a\x80\x6e\x79\x1a" + "\xcd\x82\xf1\xb0\xe2\x9f\x00\x28" + "\xa4\xc3\x8e\x97\x6f\x57\x1a\x93" + "\xf4\xfd\x57\xd7\x87\xc2\x4d\xb0" + "\xe0\x1c\xa3\x04\xe5\xa5\xc4\xdd" + "\x50\xcf\x8b\xdb\xf4\x91\xe5\x7c", + .len = 512, + }, { + .key = "\x62\x49\x77\x57\x24\x70\x93\x69" + "\x99\x59\x57\x49\x66\x96\x76\x27" + "\x02\x88\x41\x97\x16\x93\x99\x37" + "\x51\x05\x82\x09\x74\x94\x45\x92", + .klen = 32, + .iv = "\xff\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xf8\xf9\xfa\xfb\xfc", + .ctext = "\xa2\x9f\x9e\x4e\x71\xdb\x28\x3c" + "\x80\x0e\xf6\xb7\x8e\x57\x1c\xba" + "\x90\xda\x3b\x6c\x22\x00\x68\x30" + "\x1d\x63\x0d\x9e\x6a\xad\x37\x55" + "\xbc\x77\x1e\xc9\xad\x83\x30\xd5" + "\x27\xb2\x66\x77\x18\x3c\xa6\x39" + "\x9c\x0a\xaa\x1f\x02\xe1\xd5\x65" + "\x9b\x8d\xc5\x97\x3d\xc5\x04\x53" + "\x78\x00\xe3\xb0\x1a\x43\x4e\xb7" + "\xc4\x9f\x38\xc5\x7b\xa4\x70\x64" + "\x78\xe6\x32\xd9\x65\x44\xc5\x64" + "\xb8\x42\x35\x99\xff\x66\x75\xb0" + "\x22\xd3\x9b\x6e\x8d\xcf\x6a\x24" + "\xfd\x92\xb7\x1b\x04\x28\x2a\x61" + "\xdc\x96\x2a\x20\x7a\x2c\xf1\xf9" + "\x12\x15\xf0\x4d\xcf\x2b\xde\x33" + "\x41\xbc\xe7\x85\x87\x22\xb7\x16" + "\x02\x1c\xd8\xa2\x0f\x1f\xa3\xe9" + "\xd8\x45\x48\xe7\xbe\x08\x4e\x4e" + "\x23\x79\x84\xdb\x40\x76\xf5\x13" + "\x78\x92\x4a\x2f\xf9\x1b\xf2\x80" + "\x25\x74\x51\x45\x9a\x77\x78\x97" + "\xd3\xe0\xc7\xc4\x35\x67\x2a\xe6" + "\xb3\x0d\x62\x9f\x8b", + .len = 189, + }, +}; + static const struct aead_testvec sm4_gcm_tv_template[] = { { /* From https://datatracker.ietf.org/doc/html/rfc8998#appendix-A.1 */ .key = "\x01\x23\x45\x67\x89\xAB\xCD\xEF" @@ -14913,6 +15260,298 @@ static const struct aead_testvec sm4_gcm_tv_template[] = { "\x83\xDE\x35\x41\xE4\xC2\xB5\x81" "\x77\xE0\x65\xA9\xBF\x7B\x62\xEC", .clen = 80, + }, { /* Generated from AES-GCM test vectors */ + .key = zeroed_string, + .klen = 16, + .ctext = "\x23\x2f\x0c\xfe\x30\x8b\x49\xea" + "\x6f\xc8\x82\x29\xb5\xdc\x85\x8d", + .clen = 16, + }, { + .key = zeroed_string, + .klen = 16, + .ptext = zeroed_string, + .plen = 16, + .ctext = "\x7d\xe2\xaa\x7f\x11\x10\x18\x82" + "\x18\x06\x3b\xe1\xbf\xeb\x6d\x89" + "\xb8\x51\xb5\xf3\x94\x93\x75\x2b" + "\xe5\x08\xf1\xbb\x44\x82\xc5\x57", + .clen = 32, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 16, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39\x1a\xaf\xd2\x55", + .plen = 64, + .ctext = "\xe4\x11\x0f\xf1\xc1\x41\x97\xe6" + "\x76\x21\x6a\x33\x83\x10\x41\xeb" + "\x09\x58\x00\x11\x7b\xdc\x3f\x75" + "\x1a\x49\x6e\xfc\xf2\xbb\xdf\xdb" + "\x3a\x2e\x13\xfd\xc5\xc1\x9d\x07" + "\x1a\xe5\x48\x3f\xed\xde\x98\x5d" + "\x3f\x2d\x5b\x4e\xee\x0b\xb6\xdf" + "\xe3\x63\x36\x83\x23\xf7\x5b\x80" + "\x7d\xfe\x77\xef\x71\xb1\x5e\xc9" + "\x52\x6b\x09\xab\x84\x28\x4b\x8a", + .clen = 80, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 16, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39", + .plen = 60, + .assoc = "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", + .alen = 20, + .ctext = "\xe4\x11\x0f\xf1\xc1\x41\x97\xe6" + "\x76\x21\x6a\x33\x83\x10\x41\xeb" + "\x09\x58\x00\x11\x7b\xdc\x3f\x75" + "\x1a\x49\x6e\xfc\xf2\xbb\xdf\xdb" + "\x3a\x2e\x13\xfd\xc5\xc1\x9d\x07" + "\x1a\xe5\x48\x3f\xed\xde\x98\x5d" + "\x3f\x2d\x5b\x4e\xee\x0b\xb6\xdf" + "\xe3\x63\x36\x83" + "\x89\xf6\xba\x35\xb8\x18\xd3\xcc" + "\x38\x6c\x05\xb3\x8a\xcb\xc9\xde", + .clen = 76, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c", + .klen = 16, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39", + .plen = 60, + .assoc = "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", + .alen = 20, + .ctext = "\xc1\x11\x44\x51\xd9\x25\x87\x5b" + "\x0f\xd9\x06\xf3\x33\x44\xbb\x87" + "\x8b\xa3\x77\xd2\x0c\x60\xfa\xcc" + "\x85\x50\x6f\x96\x0c\x54\x54\xc1" + "\x58\x04\x88\x6e\xf4\x26\x35\x7e" + "\x94\x80\x48\x6c\xf2\xf4\x88\x1f" + "\x19\x63\xea\xae\xba\x81\x1a\x5d" + "\x0e\x6f\x59\x08" + "\x33\xac\x5b\xa8\x19\x60\xdb\x1d" + "\xdd\x2e\x22\x2e\xe0\x87\x51\x5d", + .clen = 76, + }, { + .key = "\x8b\x32\xcf\xe7\x44\xed\x13\x59" + "\x04\x38\x77\xb0\xb9\xad\xb4\x38", + .klen = 16, + .iv = "\x00\xff\xff\xff\xff\x00\x00\xff" + "\xff\xff\x00\xff", + .ptext = "\x42\xc1\xcc\x08\x48\x6f\x41\x3f" + "\x2f\x11\x66\x8b\x2a\x16\xf0\xe0" + "\x58\x83\xf0\xc3\x70\x14\xc0\x5b" + "\x3f\xec\x1d\x25\x3c\x51\xd2\x03" + "\xcf\x59\x74\x1f\xb2\x85\xb4\x07" + "\xc6\x6a\x63\x39\x8a\x5b\xde\xcb" + "\xaf\x08\x44\xbd\x6f\x91\x15\xe1" + "\xf5\x7a\x6e\x18\xbd\xdd\x61\x50" + "\x59\xa9\x97\xab\xbb\x0e\x74\x5c" + "\x00\xa4\x43\x54\x04\x54\x9b\x3b" + "\x77\xec\xfd\x5c\xa6\xe8\x7b\x08" + "\xae\xe6\x10\x3f\x32\x65\xd1\xfc" + "\xa4\x1d\x2c\x31\xfb\x33\x7a\xb3" + "\x35\x23\xf4\x20\x41\xd4\xad\x82" + "\x8b\xa4\xad\x96\x1c\x20\x53\xbe" + "\x0e\xa6\xf4\xdc\x78\x49\x3e\x72" + "\xb1\xa9\xb5\x83\xcb\x08\x54\xb7" + "\xad\x49\x3a\xae\x98\xce\xa6\x66" + "\x10\x30\x90\x8c\x55\x83\xd7\x7c" + "\x8b\xe6\x53\xde\xd2\x6e\x18\x21" + "\x01\x52\xd1\x9f\x9d\xbb\x9c\x73" + "\x57\xcc\x89\x09\x75\x9b\x78\x70" + "\xed\x26\x97\x4d\xb4\xe4\x0c\xa5" + "\xfa\x70\x04\x70\xc6\x96\x1c\x7d" + "\x54\x41\x77\xa8\xe3\xb0\x7e\x96" + "\x82\xd9\xec\xa2\x87\x68\x55\xf9" + "\x8f\x9e\x73\x43\x47\x6a\x08\x36" + "\x93\x67\xa8\x2d\xde\xac\x41\xa9" + "\x5c\x4d\x73\x97\x0f\x70\x68\xfa" + "\x56\x4d\x00\xc2\x3b\x1f\xc8\xb9" + "\x78\x1f\x51\x07\xe3\x9a\x13\x4e" + "\xed\x2b\x2e\xa3\xf7\x44\xb2\xe7" + "\xab\x19\x37\xd9\xba\x76\x5e\xd2" + "\xf2\x53\x15\x17\x4c\x6b\x16\x9f" + "\x02\x66\x49\xca\x7c\x91\x05\xf2" + "\x45\x36\x1e\xf5\x77\xad\x1f\x46" + "\xa8\x13\xfb\x63\xb6\x08\x99\x63" + "\x82\xa2\xed\xb3\xac\xdf\x43\x19" + "\x45\xea\x78\x73\xd9\xb7\x39\x11" + "\xa3\x13\x7c\xf8\x3f\xf7\xad\x81" + "\x48\x2f\xa9\x5c\x5f\xa0\xf0\x79" + "\xa4\x47\x7d\x80\x20\x26\xfd\x63" + "\x0a\xc7\x7e\x6d\x75\x47\xff\x76" + "\x66\x2e\x8a\x6c\x81\x35\xaf\x0b" + "\x2e\x6a\x49\x60\xc1\x10\xe1\xe1" + "\x54\x03\xa4\x09\x0c\x37\x7a\x15" + "\x23\x27\x5b\x8b\x4b\xa5\x64\x97" + "\xae\x4a\x50\x73\x1f\x66\x1c\x5c" + "\x03\x25\x3c\x8d\x48\x58\x71\x34" + "\x0e\xec\x4e\x55\x1a\x03\x6a\xe5" + "\xb6\x19\x2b\x84\x2a\x20\xd1\xea" + "\x80\x6f\x96\x0e\x05\x62\xc7\x78" + "\x87\x79\x60\x38\x46\xb4\x25\x57" + "\x6e\x16\x63\xf8\xad\x6e\xd7\x42" + "\x69\xe1\x88\xef\x6e\xd5\xb4\x9a" + "\x3c\x78\x6c\x3b\xe5\xa0\x1d\x22" + "\x86\x5c\x74\x3a\xeb\x24\x26\xc7" + "\x09\xfc\x91\x96\x47\x87\x4f\x1a" + "\xd6\x6b\x2c\x18\x47\xc0\xb8\x24" + "\xa8\x5a\x4a\x9e\xcb\x03\xe7\x2a" + "\x09\xe6\x4d\x9c\x6d\x86\x60\xf5" + "\x2f\x48\x69\x37\x9f\xf2\xd2\xcb" + "\x0e\x5a\xdd\x6e\x8a\xfb\x6a\xfe" + "\x0b\x63\xde\x87\x42\x79\x8a\x68" + "\x51\x28\x9b\x7a\xeb\xaf\xb8\x2f" + "\x9d\xd1\xc7\x45\x90\x08\xc9\x83" + "\xe9\x83\x84\xcb\x28\x69\x09\x69" + "\xce\x99\x46\x00\x54\xcb\xd8\x38" + "\xf9\x53\x4a\xbf\x31\xce\x57\x15" + "\x33\xfa\x96\x04\x33\x42\xe3\xc0" + "\xb7\x54\x4a\x65\x7a\x7c\x02\xe6" + "\x19\x95\xd0\x0e\x82\x07\x63\xf9" + "\xe1\x2b\x2a\xfc\x55\x92\x52\xc9" + "\xb5\x9f\x23\x28\x60\xe7\x20\x51" + "\x10\xd3\xed\x6d\x9b\xab\xb8\xe2" + "\x5d\x9a\x34\xb3\xbe\x9c\x64\xcb" + "\x78\xc6\x91\x22\x40\x91\x80\xbe" + "\xd7\x78\x5c\x0e\x0a\xdc\x08\xe9" + "\x67\x10\xa4\x83\x98\x79\x23\xe7" + "\x92\xda\xa9\x22\x16\xb1\xe7\x78" + "\xa3\x1c\x6c\x8f\x35\x7c\x4d\x37" + "\x2f\x6e\x0b\x50\x5c\x34\xb9\xf9" + "\xe6\x3d\x91\x0d\x32\x95\xaa\x3d" + "\x48\x11\x06\xbb\x2d\xf2\x63\x88" + "\x3f\x73\x09\xe2\x45\x56\x31\x51" + "\xfa\x5e\x4e\x62\xf7\x90\xf9\xa9" + "\x7d\x7b\x1b\xb1\xc8\x26\x6e\x66" + "\xf6\x90\x9a\x7f\xf2\x57\xcc\x23" + "\x59\xfa\xfa\xaa\x44\x04\x01\xa7" + "\xa4\x78\xdb\x74\x3d\x8b\xb5", + .plen = 719, + .ctext = "\xdc\xb1\x0f\x2a\xe8\x2d\x1c\x57" + "\xc4\x82\xfa\xd6\x87\xe6\x2f\x50" + "\xbd\x9e\x0a\x42\x31\xf2\xc7\xbb" + "\x21\x63\xa7\x05\x43\x33\xef\x33" + "\x5c\xd3\x47\x55\xce\x5c\xe4\xd4" + "\xe5\x07\x62\x22\xac\x01\xa8\x35" + "\x9c\x59\x34\x30\x8e\xff\x9f\xb4" + "\xd2\x4e\x74\x90\x64\xf2\x78\x5e" + "\x63\xb7\xc5\x08\x1b\x37\xa5\x9e" + "\xc0\xde\xff\xa9\x7f\x0b\xd3\x02" + "\x83\x6e\x33\xfa\x43\x11\xd3\xda" + "\x02\xcf\xcd\x4a\xc0\x78\x1f\x39" + "\x62\xcb\xa3\x95\x7e\x13\x92\x28" + "\xb2\xc4\x7a\xba\xd1\xc6\xf6\x1f" + "\xda\x0b\xf1\xd1\x99\x54\xd8\x3b" + "\x16\xf8\xe6\x97\x1e\xa7\xcf\x49" + "\x69\x84\x01\x4c\xdc\x7a\x34\xff" + "\x01\x08\xa3\x0b\x39\xac\x21\x37" + "\xd8\xb4\x04\x19\x8b\x7a\x7d\x17" + "\x44\xd1\x18\xaf\x1f\xa9\x29\xfe" + "\xfa\x77\xe0\x40\x42\x0c\x79\xb7" + "\xc3\x15\x1b\xd9\x0c\x82\xfc\x16" + "\x70\xd6\x2a\xe9\x94\x72\xc5\xa5" + "\x8a\x58\xbc\xfa\xe0\x88\x39\x4a" + "\x80\xe8\xec\xaf\x60\xac\xe7\xf8" + "\x9c\xf0\xfc\x61\x39\x07\x98\x6b" + "\x88\xe3\x98\x22\x28\x18\x4a\x2d" + "\x25\xef\x10\xe3\x83\x66\x3f\xfd" + "\xc7\x0b\xa3\xfd\x97\xa9\xf4\xbd" + "\xd8\x2a\xee\x4a\x50\xad\xcc\xb5" + "\xc7\xab\xb8\x79\x9c\xd1\xf1\x27" + "\x08\xf5\xf5\xe8\x1b\x66\xce\x41" + "\x56\x60\x94\x86\xf0\x78\xc2\xfa" + "\x5b\x63\x40\xb1\xd1\x1a\x38\x69" + "\x0b\x8c\xb2\xf5\xa2\xbe\x90\x9d" + "\x46\x23\x79\x8b\x3b\x4a\xf4\xbb" + "\x55\xf7\x58\x9d\xaf\x59\xff\x74" + "\xf3\xb9\xc4\x26\xb1\xf8\xe1\x28" + "\x8b\x5e\x8f\x6d\x64\xe7\xe8\x63" + "\xd2\x9e\xcb\xee\xae\x19\x04\x1d" + "\x05\xf0\x9d\x99\x7b\x33\x33\xae" + "\x6e\xe5\x09\xdd\x67\x51\xc4\xc8" + "\x6a\xc7\x36\x35\xc9\x93\x76\xa1" + "\xa8\x1c\xfa\x75\x92\x34\x0e\x7d" + "\x3d\x1d\xef\x00\xfd\xa5\x25\x12" + "\x7c\x91\x21\x41\xcc\x50\x47\xa9" + "\x22\x50\x24\x96\x34\x79\x3d\xe8" + "\x3f\xa0\x56\xaf\x98\x53\x55\xc3" + "\x46\x1b\x17\x54\xb8\xb0\xb7\xe0" + "\xe0\xab\x47\x6f\x06\xda\xcc\x75" + "\xa7\x96\xb7\x92\xf3\xa0\x5f\xe6" + "\xba\x97\xe3\x2f\x97\x05\xb2\x99" + "\xa0\x09\x10\x98\x9c\xd3\x2e\xd1" + "\x7e\x2a\x30\x54\x3c\xb9\x33\xe3" + "\xf2\xaf\xd3\xa5\xee\xd0\x0b\x8a" + "\x19\x54\x0f\x02\x51\x1f\x91\xdf" + "\x71\x9c\xad\x77\x35\x28\x55\x6d" + "\xcd\x7a\xd9\xa3\x41\x98\x6b\x37" + "\x19\x0f\xbe\xae\x69\xb2\x25\x01" + "\xee\x0e\x51\x4b\x53\xea\x0f\x5f" + "\x85\x74\x79\x36\x32\x0a\x2a\x40" + "\xad\x6b\x78\x41\x54\x99\xe9\xc1" + "\x2b\x6c\x9b\x42\x21\xef\xe2\x50" + "\x56\x8d\x78\xdf\x58\xbe\x0a\x0f" + "\xfc\xfc\x0d\x2e\xd0\xcb\xa6\x0a" + "\xa8\xd9\x1e\xa9\xd4\x7c\x99\x88" + "\xcf\x11\xad\x1c\xd3\x04\x63\x55" + "\xef\x85\x0b\x69\xa1\x40\xf1\x75" + "\x24\xf4\xe5\x2c\xd4\x7a\x24\x50" + "\x8f\xa2\x71\xc9\x92\x20\xcd\xcf" + "\xda\x40\xbe\xf6\xfe\x1a\xca\xc7" + "\x4a\x80\x45\x55\xcb\xdd\xb7\x01" + "\xb0\x8d\xcb\xd2\xae\xbd\xa4\xd0" + "\x5c\x10\x05\x66\x7b\xd4\xff\xd9" + "\xc4\x23\x9d\x8d\x6b\x24\xf8\x3f" + "\x73\x4d\x5c\x2b\x33\x4c\x5e\x63" + "\x74\x6d\x03\xa1\x7a\x35\x65\x17" + "\x38\x7f\x3b\xc1\x69\xcf\x61\x34" + "\x30\x21\xaf\x97\x47\x12\x3f\xa1" + "\xa7\x50\xc5\x87\xfb\x3f\x70\x32" + "\x86\x17\x5f\x25\xe4\x74\xc6\xd0" + "\x9b\x39\xe6\xe1\x5a\xec\x8f\x40" + "\xce\xcc\x37\x3b\xd8\x72\x1c\x31" + "\x75\xa4\xa6\x89\x8c\xdd\xd6\xd2" + "\x32\x3d\xe8\xc3\x54\xab\x1f\x35" + "\x52\xb4\x94\x81\xb0\x37\x3a\x03" + "\xbb\xb1\x99\x30\xa5\xf8\x21\xcd" + "\x93\x5d\xa7\x13\xed\xc7\x49\x09" + "\x70\xda\x08\x39\xaa\x15\x9e\x45" + "\x35\x2b\x0f\x5c\x8c\x8b\xc9" + "\xa8\xb8\x9f\xfd\x37\x36\x31\x7e" + "\x34\x4f\xc1\xc0\xca\x8a\x22\xfd", + .clen = 735, } }; @@ -14947,6 +15586,282 @@ static const struct aead_testvec sm4_ccm_tv_template[] = { "\x16\x84\x2D\x4F\xA1\x86\xF5\x6A" "\xB3\x32\x56\x97\x1F\xA1\x10\xF4", .clen = 80, + }, { /* Generated from AES-CCM test vectors */ + .key = "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf", + .klen = 16, + .iv = "\x01\x00\x00\x00\x03\x02\x01\x00" + "\xa0\xa1\xa2\xa3\xa4\xa5\x00\x00", + .assoc = "\x00\x01\x02\x03\x04\x05\x06\x07", + .alen = 8, + .ptext = "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e", + .plen = 23, + .ctext = "\x7b\xff\x4a\x15\xf5\x73\xce\x82" + "\x6e\xc2\x31\x1d\xe2\x53\x02\xac" + "\xa4\x48\xf9\xe4\xf5\x1f\x81\x70" + "\x18\xbc\xb6\x84\x01\xb8\xae", + .clen = 31, + }, { + .key = "\xf4\x6b\xc2\x75\x62\xfe\xb4\xe1" + "\x53\x14\x73\x66\x8d\x88\xf6\x80", + .klen = 16, + .iv = "\x03\xa0\x20\x35\x26\xf2\x21\x8d" + "\x50\x20\xda\xe2\x00\x00\x00\x00", + .assoc = "\x5b\x9e\x13\x67\x02\x5e\xef\xc1" + "\x6c\xf9\xd7\x1e\x52\x8f\x7a\x47" + "\xe9\xd4\xcf\x20\x14\x6e\xf0\x2d" + "\xd8\x9e\x2b\x56\x10\x23\x56\xe7", + .alen = 32, + .ctext = "\x23\x58\xce\xdc\x40\xb1\xcd\x92" + "\x47\x96\x59\xfc\x8a\x26\x4f\xcf", + .clen = 16, + }, { + .key = "\xab\x2f\x8a\x74\xb7\x1c\xd2\xb1" + "\xff\x80\x2e\x48\x7d\x82\xf8\xb9", + .klen = 16, + .iv = "\x03\xaf\x94\x87\x78\x35\x82\x81" + "\x7f\x88\x94\x68\x00\x00\x00\x00", + .alen = 0, + .ptext = "\x00", + .plen = 0, + .ctext = "\x72\x7e\xf5\xd6\x39\x7a\x2b\x43", + .clen = 8, + }, { + .key = "\x39\xbb\xa7\xbe\x59\x97\x9e\x73" + "\xa4\x48\x93\x39\x26\x71\x4a\xc6", + .klen = 16, + .iv = "\x03\xee\x49\x83\xe9\xa9\xff\xe9" + "\x57\xba\xfd\x9e\x00\x00\x00\x00", + .assoc = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1" + "\x58\x7c\xf2\x5c\x6d\x39\x0a\x64" + "\xa4\xf0\x13\x05\xd1\x77\x99\x67" + "\x11\xc4\xc6\xdb\x00\x56\x36\x61", + .alen = 32, + .ptext = "\x00", + .plen = 0, + .ctext = "\xb0\x9d\xc6\xfb\x7d\xb5\xa1\x0e", + .clen = 8, + }, { + .key = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7" + "\x0d\x1a\x53\x3b\xb5\xe3\xf8\x8b", + .klen = 16, + .iv = "\x03\xcf\x76\x3f\xd9\x95\x75\x8f" + "\x44\x89\x40\x7b\x00\x00\x00\x00", + .assoc = "\x8f\x86\x6c\x4d\x1d\xc5\x39\x88" + "\xc8\xf3\x5c\x52\x10\x63\x6f\x2b" + "\x8a\x2a\xc5\x6f\x30\x23\x58\x7b" + "\xfb\x36\x03\x11\xb4\xd9\xf2\xfe", + .alen = 32, + .ptext = "\xc2\x54\xc8\xde\x78\x87\x77\x40" + "\x49\x71\xe4\xb7\xe7\xcb\x76\x61" + "\x0a\x41\xb9\xe9\xc0\x76\x54\xab" + "\x04\x49\x3b\x19\x93\x57\x25\x5d", + .plen = 32, + .ctext = "\xc9\xae\xef\x1d\xf3\x2c\xd3\x38" + "\xc9\x7f\x7e\x28\xe8\xaa\xb3\x60" + "\x49\xdc\x66\xca\x7b\x3d\xe0\x3c" + "\xcb\x45\x9c\x1b\xb2\xbe\x07\x90" + "\x87\xa6\x6b\x89\x0d\x0f\x90\xaa" + "\x7d\xf6\x5a\x9a\x68\x2b\x81\x92", + .clen = 48, + }, { + .key = "\x8b\x32\xcf\xe7\x44\xed\x13\x59" + "\x04\x38\x77\xb0\xb9\xad\xb4\x38", + .klen = 16, + .iv = "\x02\xff\xff\xff\xff\x00\x00\xff" + "\xff\xff\x00\xff\xff\x00\x00\x00", + .assoc = "\x8f\x86\x6c\x4d\x1d\xc5\x39\x88" + "\xc8\xf3\x5c\x52\x10\x63\x6f\x2b" + "\x8a\x2a\xc5\x6f\x30\x23\x58\x7b" + "\xfb\x36\x03\x11\xb4\xd9\xf2\xfe" + "\xc8\xf3\x5c\x52\x10\x63", + .alen = 38, + .ptext = "\x42\xc1\xcc\x08\x48\x6f\x41\x3f" + "\x2f\x11\x66\x8b\x2a\x16\xf0\xe0" + "\x58\x83\xf0\xc3\x70\x14\xc0\x5b" + "\x3f\xec\x1d\x25\x3c\x51\xd2\x03" + "\xcf\x59\x74\x1f\xb2\x85\xb4\x07" + "\xc6\x6a\x63\x39\x8a\x5b\xde\xcb" + "\xaf\x08\x44\xbd\x6f\x91\x15\xe1" + "\xf5\x7a\x6e\x18\xbd\xdd\x61\x50" + "\x59\xa9\x97\xab\xbb\x0e\x74\x5c" + "\x00\xa4\x43\x54\x04\x54\x9b\x3b" + "\x77\xec\xfd\x5c\xa6\xe8\x7b\x08" + "\xae\xe6\x10\x3f\x32\x65\xd1\xfc" + "\xa4\x1d\x2c\x31\xfb\x33\x7a\xb3" + "\x35\x23\xf4\x20\x41\xd4\xad\x82" + "\x8b\xa4\xad\x96\x1c\x20\x53\xbe" + "\x0e\xa6\xf4\xdc\x78\x49\x3e\x72" + "\xb1\xa9\xb5\x83\xcb\x08\x54\xb7" + "\xad\x49\x3a\xae\x98\xce\xa6\x66" + "\x10\x30\x90\x8c\x55\x83\xd7\x7c" + "\x8b\xe6\x53\xde\xd2\x6e\x18\x21" + "\x01\x52\xd1\x9f\x9d\xbb\x9c\x73" + "\x57\xcc\x89\x09\x75\x9b\x78\x70" + "\xed\x26\x97\x4d\xb4\xe4\x0c\xa5" + "\xfa\x70\x04\x70\xc6\x96\x1c\x7d" + "\x54\x41\x77\xa8\xe3\xb0\x7e\x96" + "\x82\xd9\xec\xa2\x87\x68\x55\xf9" + "\x8f\x9e\x73\x43\x47\x6a\x08\x36" + "\x93\x67\xa8\x2d\xde\xac\x41\xa9" + "\x5c\x4d\x73\x97\x0f\x70\x68\xfa" + "\x56\x4d\x00\xc2\x3b\x1f\xc8\xb9" + "\x78\x1f\x51\x07\xe3\x9a\x13\x4e" + "\xed\x2b\x2e\xa3\xf7\x44\xb2\xe7" + "\xab\x19\x37\xd9\xba\x76\x5e\xd2" + "\xf2\x53\x15\x17\x4c\x6b\x16\x9f" + "\x02\x66\x49\xca\x7c\x91\x05\xf2" + "\x45\x36\x1e\xf5\x77\xad\x1f\x46" + "\xa8\x13\xfb\x63\xb6\x08\x99\x63" + "\x82\xa2\xed\xb3\xac\xdf\x43\x19" + "\x45\xea\x78\x73\xd9\xb7\x39\x11" + "\xa3\x13\x7c\xf8\x3f\xf7\xad\x81" + "\x48\x2f\xa9\x5c\x5f\xa0\xf0\x79" + "\xa4\x47\x7d\x80\x20\x26\xfd\x63" + "\x0a\xc7\x7e\x6d\x75\x47\xff\x76" + "\x66\x2e\x8a\x6c\x81\x35\xaf\x0b" + "\x2e\x6a\x49\x60\xc1\x10\xe1\xe1" + "\x54\x03\xa4\x09\x0c\x37\x7a\x15" + "\x23\x27\x5b\x8b\x4b\xa5\x64\x97" + "\xae\x4a\x50\x73\x1f\x66\x1c\x5c" + "\x03\x25\x3c\x8d\x48\x58\x71\x34" + "\x0e\xec\x4e\x55\x1a\x03\x6a\xe5" + "\xb6\x19\x2b\x84\x2a\x20\xd1\xea" + "\x80\x6f\x96\x0e\x05\x62\xc7\x78" + "\x87\x79\x60\x38\x46\xb4\x25\x57" + "\x6e\x16\x63\xf8\xad\x6e\xd7\x42" + "\x69\xe1\x88\xef\x6e\xd5\xb4\x9a" + "\x3c\x78\x6c\x3b\xe5\xa0\x1d\x22" + "\x86\x5c\x74\x3a\xeb\x24\x26\xc7" + "\x09\xfc\x91\x96\x47\x87\x4f\x1a" + "\xd6\x6b\x2c\x18\x47\xc0\xb8\x24" + "\xa8\x5a\x4a\x9e\xcb\x03\xe7\x2a" + "\x09\xe6\x4d\x9c\x6d\x86\x60\xf5" + "\x2f\x48\x69\x37\x9f\xf2\xd2\xcb" + "\x0e\x5a\xdd\x6e\x8a\xfb\x6a\xfe" + "\x0b\x63\xde\x87\x42\x79\x8a\x68" + "\x51\x28\x9b\x7a\xeb\xaf\xb8\x2f" + "\x9d\xd1\xc7\x45\x90\x08\xc9\x83" + "\xe9\x83\x84\xcb\x28\x69\x09\x69" + "\xce\x99\x46\x00\x54\xcb\xd8\x38" + "\xf9\x53\x4a\xbf\x31\xce\x57\x15" + "\x33\xfa\x96\x04\x33\x42\xe3\xc0" + "\xb7\x54\x4a\x65\x7a\x7c\x02\xe6" + "\x19\x95\xd0\x0e\x82\x07\x63\xf9" + "\xe1\x2b\x2a\xfc\x55\x92\x52\xc9" + "\xb5\x9f\x23\x28\x60\xe7\x20\x51" + "\x10\xd3\xed\x6d\x9b\xab\xb8\xe2" + "\x5d\x9a\x34\xb3\xbe\x9c\x64\xcb" + "\x78\xc6\x91\x22\x40\x91\x80\xbe" + "\xd7\x78\x5c\x0e\x0a\xdc\x08\xe9" + "\x67\x10\xa4\x83\x98\x79\x23\xe7" + "\x92\xda\xa9\x22\x16\xb1\xe7\x78" + "\xa3\x1c\x6c\x8f\x35\x7c\x4d\x37" + "\x2f\x6e\x0b\x50\x5c\x34\xb9\xf9" + "\xe6\x3d\x91\x0d\x32\x95\xaa\x3d" + "\x48\x11\x06\xbb\x2d\xf2\x63\x88" + "\x3f\x73\x09\xe2\x45\x56\x31\x51" + "\xfa\x5e\x4e\x62\xf7\x90\xf9\xa9" + "\x7d\x7b\x1b\xb1\xc8\x26\x6e\x66" + "\xf6\x90\x9a\x7f\xf2\x57\xcc\x23" + "\x59\xfa\xfa\xaa\x44\x04\x01\xa7" + "\xa4\x78\xdb\x74\x3d\x8b\xb5", + .plen = 719, + .ctext = "\xc5\x50\x85\x02\x72\xa8\xb3\x62" + "\xf9\xcd\x77\x7b\x43\xa5\x04\x70" + "\x68\x40\x57\x21\x1c\xfe\xef\x05" + "\x4d\xb8\x44\xba\x59\xea\x62\x32" + "\xcb\x6b\x6a\x39\x9b\xf3\xe5\xa4" + "\x36\x38\xde\x7d\xcf\xb6\xcd\xe3" + "\x89\xbf\x37\xc9\x96\x3c\x70\x10" + "\x92\x47\xcc\xac\x6f\xf8\x55\x9a" + "\x26\x43\x34\xb4\x92\x7d\x68\xfc" + "\x60\x37\x74\x2a\x55\xba\xc7\xd7" + "\x98\x69\xb7\xcf\x42\xfd\xb2\x10" + "\xa0\x59\xe1\x2c\x73\x66\x12\x97" + "\x85\x8b\x28\xcc\x29\x02\x15\x89" + "\x23\xd3\x32\x92\x87\x57\x09\x13" + "\x04\x7e\x8b\x6c\x3a\xc1\x4e\x6c" + "\xe1\x9f\xc8\xcc\x47\x9c\xd8\x10" + "\xf4\xb7\x5c\x30\x7a\x8b\x0f\x01" + "\x52\x38\x02\x92\x99\xac\x03\x90" + "\x18\x32\x2d\x21\x6a\x0a\x2a\xe7" + "\xc2\xcc\x15\x84\x4e\x2b\x0b\x3a" + "\x4c\xdc\xb0\x6b\x10\xd1\x27\x10" + "\xf0\x4a\x5c\x43\xa0\x34\x34\x59" + "\x47\x43\x48\xcb\x69\xa7\xff\x52" + "\xb8\xca\x23\x09\x07\xd7\xc5\xe4" + "\x2a\x4f\x99\xd5\x83\x36\x2a\x2d" + "\x59\xd0\xca\xb0\xfa\x40\x8c\xab" + "\xdf\x69\x08\xd9\x79\x1d\xde\xa8" + "\x0b\x34\x74\x4d\xf5\xa0\x4c\x81" + "\x7f\x93\x06\x40\x24\xfe\x7d\xcd" + "\xe4\xfe\xf8\xf8\x30\xce\xd0\x5d" + "\x70\xfd\x0d\x5a\x78\x85\x74\x2d" + "\xe4\xb5\x40\x18\x99\x11\xe4\x6a" + "\xdf\xfa\x4f\x25\x2c\xde\x15\xb7" + "\x12\xd8\xc6\x90\x0d\x0f\xc9\xfb" + "\x21\xf1\xed\xfe\x98\xe1\x03\xe2" + "\x5c\xef\xb6\xc7\x87\x77\x0e\xcd" + "\xff\x78\x94\xc9\xbe\xd3\x47\xf7" + "\x8d\x37\x48\x01\x42\xe2\x17\x96" + "\xfc\xc0\xcb\x7b\x7b\x57\xaf\x3b" + "\xc9\xd0\x94\xce\x5e\x1b\xa9\x47" + "\x02\x4d\x74\xcc\x45\x1d\xd3\x2d" + "\x5f\x4f\x7f\xf2\x4b\xf9\x59\xee" + "\x9e\x9e\xb9\x95\x29\x19\xd1\x5f" + "\x72\xab\x8d\xf1\x28\xd1\x1c\xae" + "\xc2\xba\xf7\x22\x84\x2c\x83\x51" + "\x03\xad\xa3\xef\x81\xa7\xdc\xf1" + "\x44\x51\x50\x96\x70\xd1\xe5\x47" + "\x57\xf9\x30\x90\xe4\xbf\xfc\x75" + "\x14\xaa\x4d\xb7\xb1\xe7\x79\x33" + "\x43\xc2\x5c\xc1\xbc\x09\x92\x0f" + "\xa7\xaf\x68\x51\x51\xec\x0b\xc3" + "\x3d\x2b\x94\x30\x45\x29\x1b\x9e" + "\x70\x56\xf8\xd6\x67\x2d\x39\x3b" + "\x3c\xd2\xd0\xd3\xdc\x7d\x84\xe9" + "\x06\x31\x98\xa6\x5c\xbf\x10\x58" + "\xce\xbb\xa7\xe1\x65\x7e\x51\x87" + "\x70\x46\xb4\x7f\xf9\xec\x92\x1c" + "\x9b\x24\x49\xc1\x04\xbe\x1c\x5f" + "\xcc\xb3\x33\x8c\xad\xe7\xdc\x32" + "\x54\xa2\x0d\x83\x0f\x3c\x12\x5d" + "\x71\xe3\x9c\xae\x71\xa3\x2a\x10" + "\xc5\x91\xb4\x73\x96\x60\xdb\x5d" + "\x1f\xd5\x9a\xd2\x69\xc3\xd7\x4b" + "\xa2\x66\x81\x96\x4a\xaa\x02\xd6" + "\xd5\x44\x9b\x42\x3a\x15\x5f\xe7" + "\x4d\x7c\xf6\x71\x4a\xea\xe8\x43" + "\xd7\x68\xe4\xbc\x05\x87\x49\x05" + "\x3b\x47\xb2\x6d\x5f\xd1\x11\xa6" + "\x58\xd4\xa2\x45\xec\xb5\x54\x55" + "\xd3\xd6\xd2\x6a\x8b\x21\x9e\x2c" + "\xf1\x27\x4b\x5b\xe3\xff\xe0\xfd" + "\x4b\xf1\xe7\xe2\x84\xf2\x17\x37" + "\x11\x68\xc4\x92\x4b\x6b\xef\x8e" + "\x75\xf5\xc2\x7d\x5c\xe9\x7c\xfc" + "\x2b\x00\x33\x0e\x7d\x69\xd8\xd4" + "\x9b\xa8\x38\x54\x7e\x6d\x23\x51" + "\x2c\xd6\xc4\x58\x23\x1c\x22\x2a" + "\x59\xc5\x9b\xec\x9d\xbf\x03\x0f" + "\xb3\xdd\xba\x02\x22\xa0\x34\x37" + "\x19\x56\xc2\x5b\x32\x1d\x1e\x66" + "\x68\xf4\x47\x05\x04\x18\xa7\x28" + "\x80\xf2\xc7\x99\xed\x1e\x72\x48" + "\x8f\x97\x5d\xb3\x74\x42\xfd\x0c" + "\x0f\x5f\x29\x0c\xf1\x35\x22\x90" + "\xd6\x7c\xb8\xa3\x2a\x89\x38\x71" + "\xe9\x7a\x55\x3c\x3b\xf2\x6e\x1a" + "\x22\x8f\x07\x81\xc1\xe1\xf1\x76" + "\x2a\x75\xab\x86\xc4\xcc\x52\x59" + "\x83\x19\x5e\xb3\x53\xe2\x81\xdf" + "\xe6\x15\xb3\xba\x0c\x0e\xba" + "\xa9\x2c\xed\x51\xd5\x06\xc8\xc6" + "\x4b\x9f\x5d\x1b\x61\x31\xad\xf4", + .clen = 735, } }; @@ -15030,6 +15945,68 @@ static const struct hash_testvec sm4_cmac128_tv_template[] = { } }; +static const struct hash_testvec sm4_xcbc128_tv_template[] = { + { /* Generated from AES-XCBC128 test vectors */ + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = zeroed_string, + .digest = "\xa9\x9a\x5c\x44\xe2\x34\xee\x2c" + "\x9b\xe4\x9d\xca\x64\xb0\xa5\xc4", + .psize = 0, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = "\x00\x01\x02", + .digest = "\x17\x27\x62\xf3\x8b\x88\x1d\xc0" + "\x97\x35\x9c\x3e\x9f\x27\xb7\x83", + .psize = 3, + .ksize = 16, + } , { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .digest = "\xda\x45\xd1\xac\xec\x4d\xab\x46" + "\xdd\x59\xe0\x44\xff\x59\xd5\xfc", + .psize = 16, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13", + .digest = "\xbe\x24\x5d\x81\x8c\x8a\x10\xa4" + "\x8e\xc2\x16\xfa\xa4\x83\xc9\x2a", + .psize = 20, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + .digest = "\x91\x82\x31\x56\xd5\x77\xa4\xc5" + "\x88\x2d\xce\x3a\x87\x5e\xbd\xba", + .psize = 32, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21", + .digest = "\x2a\xae\xa5\x24\x0c\x12\x9f\x5f" + "\x55\xfb\xae\x35\x13\x0d\x22\x2d", + .psize = 34, + .ksize = 16, + } +}; + /* Cast6 test vectors from RFC 2612 */ static const struct cipher_testvec cast6_tv_template[] = { { -- GitLab From 3c3836378dd578a0d510420c0f67818e8dc49f0e Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:57 +0800 Subject: [PATCH 0362/1423] crypto: tcrypt - add SM4 cts-cbc/xts/xcbc test Added CTS-CBC/XTS/XCBC tests for SM4 algorithms, as well as corresponding speed tests, this is to test performance-optimized implementations of these modes. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index b096ae901aa80..0f101897e90fb 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1710,6 +1710,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("gcm(aria)")); break; + case 59: + ret = min(ret, tcrypt_test("cts(cbc(sm4))")); + break; + case 100: ret = min(ret, tcrypt_test("hmac(md5)")); break; @@ -1810,6 +1814,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("cmac(sm4)")); break; + case 160: + ret = min(ret, tcrypt_test("xcbc(sm4)")); + break; + case 181: ret = min(ret, tcrypt_test("authenc(hmac(sha1),cbc(des))")); break; @@ -1845,6 +1853,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret = min(ret, tcrypt_test("cbc(sm4)")); ret = min(ret, tcrypt_test("cfb(sm4)")); ret = min(ret, tcrypt_test("ctr(sm4)")); + ret = min(ret, tcrypt_test("xts(sm4)")); break; case 192: ret = min(ret, tcrypt_test("ecb(aria)")); @@ -2108,6 +2117,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16); test_cipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); + test_cipher_speed("cts(cbc(sm4))", ENCRYPT, sec, NULL, 0, + speed_template_16); + test_cipher_speed("cts(cbc(sm4))", DECRYPT, sec, NULL, 0, + speed_template_16); test_cipher_speed("cfb(sm4)", ENCRYPT, sec, NULL, 0, speed_template_16); test_cipher_speed("cfb(sm4)", DECRYPT, sec, NULL, 0, @@ -2116,6 +2129,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16); test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); + test_cipher_speed("xts(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_32); + test_cipher_speed("xts(sm4)", DECRYPT, sec, NULL, 0, + speed_template_32); break; case 219: @@ -2629,6 +2646,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16); test_acipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); + test_acipher_speed("xts(sm4)", ENCRYPT, sec, NULL, 0, + speed_template_32); + test_acipher_speed("xts(sm4)", DECRYPT, sec, NULL, 0, + speed_template_32); break; case 519: -- GitLab From ce41fefd2443c25166458f24621b53a28fff989f Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:58 +0800 Subject: [PATCH 0363/1423] crypto: arm64/sm4 - refactor and simplify CE implementation This patch does not add new features, but only refactors and simplifies the implementation of the Crypto Extension acceleration of the SM4 algorithm: Extract the macro optimized by SM4 Crypto Extension for reuse in the subsequent optimization of CCM/GCM modes. Encryption in CBC and CFB modes processes four blocks at a time instead of one, allowing the ld1 instruction to load 64 bytes of data at a time, which will reduces unnecessary memory accesses. CBC/CFB/CTR makes full use of free registers to reduce redundant memory accesses, and rearranges some instructions to improve out-of-order execution capabilities. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-asm.h | 209 +++++++++++ arch/arm64/crypto/sm4-ce-core.S | 646 ++++++++++++++------------------ arch/arm64/crypto/sm4-ce-glue.c | 64 ++-- 3 files changed, 519 insertions(+), 400 deletions(-) create mode 100644 arch/arm64/crypto/sm4-ce-asm.h diff --git a/arch/arm64/crypto/sm4-ce-asm.h b/arch/arm64/crypto/sm4-ce-asm.h new file mode 100644 index 0000000000000..7ea98e42e779c --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-asm.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 helper macros for Crypto Extensions + * Copyright (C) 2022 Tianjia Zhang + */ + +#define SM4_PREPARE(ptr) \ + ld1 {v24.16b-v27.16b}, [ptr], #64; \ + ld1 {v28.16b-v31.16b}, [ptr]; + +#define SM4_CRYPT_BLK_BE(b0) \ + sm4e b0.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + +#define SM4_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + SM4_CRYPT_BLK_BE(b0); + +#define SM4_CRYPT_BLK2_BE(b0, b1) \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + +#define SM4_CRYPT_BLK2(b0, b1) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + SM4_CRYPT_BLK2_BE(b0, b1); + +#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; + +#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + SM4_CRYPT_BLK4_BE(b0, b1, b2, b3); + +#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + sm4e b3.4s, v24.4s; \ + sm4e b4.4s, v24.4s; \ + sm4e b5.4s, v24.4s; \ + sm4e b6.4s, v24.4s; \ + sm4e b7.4s, v24.4s; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + sm4e b3.4s, v25.4s; \ + sm4e b4.4s, v25.4s; \ + sm4e b5.4s, v25.4s; \ + sm4e b6.4s, v25.4s; \ + sm4e b7.4s, v25.4s; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + sm4e b3.4s, v26.4s; \ + sm4e b4.4s, v26.4s; \ + sm4e b5.4s, v26.4s; \ + sm4e b6.4s, v26.4s; \ + sm4e b7.4s, v26.4s; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + sm4e b3.4s, v27.4s; \ + sm4e b4.4s, v27.4s; \ + sm4e b5.4s, v27.4s; \ + sm4e b6.4s, v27.4s; \ + sm4e b7.4s, v27.4s; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + sm4e b3.4s, v28.4s; \ + sm4e b4.4s, v28.4s; \ + sm4e b5.4s, v28.4s; \ + sm4e b6.4s, v28.4s; \ + sm4e b7.4s, v28.4s; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + sm4e b3.4s, v29.4s; \ + sm4e b4.4s, v29.4s; \ + sm4e b5.4s, v29.4s; \ + sm4e b6.4s, v29.4s; \ + sm4e b7.4s, v29.4s; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + sm4e b3.4s, v30.4s; \ + sm4e b4.4s, v30.4s; \ + sm4e b5.4s, v30.4s; \ + sm4e b6.4s, v30.4s; \ + sm4e b7.4s, v30.4s; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + sm4e b3.4s, v31.4s; \ + sm4e b4.4s, v31.4s; \ + sm4e b5.4s, v31.4s; \ + sm4e b6.4s, v31.4s; \ + sm4e b7.4s, v31.4s; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + rev64 b3.4s, b3.4s; \ + rev64 b4.4s, b4.4s; \ + rev64 b5.4s, b5.4s; \ + rev64 b6.4s, b6.4s; \ + rev64 b7.4s, b7.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + ext b3.16b, b3.16b, b3.16b, #8; \ + ext b4.16b, b4.16b, b4.16b, #8; \ + ext b5.16b, b5.16b, b5.16b, #8; \ + ext b6.16b, b6.16b, b6.16b, #8; \ + ext b7.16b, b7.16b, b7.16b, #8; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; + +#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + rev32 b3.16b, b3.16b; \ + rev32 b4.16b, b4.16b; \ + rev32 b5.16b, b5.16b; \ + rev32 b6.16b, b6.16b; \ + rev32 b7.16b, b7.16b; \ + SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7); diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index 934e0f0932799..41fc745a85284 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -10,10 +10,12 @@ #include #include +#include "sm4-ce-asm.h" .arch armv8-a+crypto -.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31 +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 20, 24, 25, 26, 27, 28, 29, 30, 31 .set .Lv\b\().4s, \b .endr @@ -34,174 +36,6 @@ #define RIV v20 -/* Helper macros. */ - -#define PREPARE \ - ld1 {v24.16b-v27.16b}, [x0], #64; \ - ld1 {v28.16b-v31.16b}, [x0]; - -#define SM4_CRYPT_BLK(b0) \ - rev32 b0.16b, b0.16b; \ - sm4e b0.4s, v24.4s; \ - sm4e b0.4s, v25.4s; \ - sm4e b0.4s, v26.4s; \ - sm4e b0.4s, v27.4s; \ - sm4e b0.4s, v28.4s; \ - sm4e b0.4s, v29.4s; \ - sm4e b0.4s, v30.4s; \ - sm4e b0.4s, v31.4s; \ - rev64 b0.4s, b0.4s; \ - ext b0.16b, b0.16b, b0.16b, #8; \ - rev32 b0.16b, b0.16b; - -#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ - rev32 b0.16b, b0.16b; \ - rev32 b1.16b, b1.16b; \ - rev32 b2.16b, b2.16b; \ - rev32 b3.16b, b3.16b; \ - sm4e b0.4s, v24.4s; \ - sm4e b1.4s, v24.4s; \ - sm4e b2.4s, v24.4s; \ - sm4e b3.4s, v24.4s; \ - sm4e b0.4s, v25.4s; \ - sm4e b1.4s, v25.4s; \ - sm4e b2.4s, v25.4s; \ - sm4e b3.4s, v25.4s; \ - sm4e b0.4s, v26.4s; \ - sm4e b1.4s, v26.4s; \ - sm4e b2.4s, v26.4s; \ - sm4e b3.4s, v26.4s; \ - sm4e b0.4s, v27.4s; \ - sm4e b1.4s, v27.4s; \ - sm4e b2.4s, v27.4s; \ - sm4e b3.4s, v27.4s; \ - sm4e b0.4s, v28.4s; \ - sm4e b1.4s, v28.4s; \ - sm4e b2.4s, v28.4s; \ - sm4e b3.4s, v28.4s; \ - sm4e b0.4s, v29.4s; \ - sm4e b1.4s, v29.4s; \ - sm4e b2.4s, v29.4s; \ - sm4e b3.4s, v29.4s; \ - sm4e b0.4s, v30.4s; \ - sm4e b1.4s, v30.4s; \ - sm4e b2.4s, v30.4s; \ - sm4e b3.4s, v30.4s; \ - sm4e b0.4s, v31.4s; \ - sm4e b1.4s, v31.4s; \ - sm4e b2.4s, v31.4s; \ - sm4e b3.4s, v31.4s; \ - rev64 b0.4s, b0.4s; \ - rev64 b1.4s, b1.4s; \ - rev64 b2.4s, b2.4s; \ - rev64 b3.4s, b3.4s; \ - ext b0.16b, b0.16b, b0.16b, #8; \ - ext b1.16b, b1.16b, b1.16b, #8; \ - ext b2.16b, b2.16b, b2.16b, #8; \ - ext b3.16b, b3.16b, b3.16b, #8; \ - rev32 b0.16b, b0.16b; \ - rev32 b1.16b, b1.16b; \ - rev32 b2.16b, b2.16b; \ - rev32 b3.16b, b3.16b; - -#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ - rev32 b0.16b, b0.16b; \ - rev32 b1.16b, b1.16b; \ - rev32 b2.16b, b2.16b; \ - rev32 b3.16b, b3.16b; \ - rev32 b4.16b, b4.16b; \ - rev32 b5.16b, b5.16b; \ - rev32 b6.16b, b6.16b; \ - rev32 b7.16b, b7.16b; \ - sm4e b0.4s, v24.4s; \ - sm4e b1.4s, v24.4s; \ - sm4e b2.4s, v24.4s; \ - sm4e b3.4s, v24.4s; \ - sm4e b4.4s, v24.4s; \ - sm4e b5.4s, v24.4s; \ - sm4e b6.4s, v24.4s; \ - sm4e b7.4s, v24.4s; \ - sm4e b0.4s, v25.4s; \ - sm4e b1.4s, v25.4s; \ - sm4e b2.4s, v25.4s; \ - sm4e b3.4s, v25.4s; \ - sm4e b4.4s, v25.4s; \ - sm4e b5.4s, v25.4s; \ - sm4e b6.4s, v25.4s; \ - sm4e b7.4s, v25.4s; \ - sm4e b0.4s, v26.4s; \ - sm4e b1.4s, v26.4s; \ - sm4e b2.4s, v26.4s; \ - sm4e b3.4s, v26.4s; \ - sm4e b4.4s, v26.4s; \ - sm4e b5.4s, v26.4s; \ - sm4e b6.4s, v26.4s; \ - sm4e b7.4s, v26.4s; \ - sm4e b0.4s, v27.4s; \ - sm4e b1.4s, v27.4s; \ - sm4e b2.4s, v27.4s; \ - sm4e b3.4s, v27.4s; \ - sm4e b4.4s, v27.4s; \ - sm4e b5.4s, v27.4s; \ - sm4e b6.4s, v27.4s; \ - sm4e b7.4s, v27.4s; \ - sm4e b0.4s, v28.4s; \ - sm4e b1.4s, v28.4s; \ - sm4e b2.4s, v28.4s; \ - sm4e b3.4s, v28.4s; \ - sm4e b4.4s, v28.4s; \ - sm4e b5.4s, v28.4s; \ - sm4e b6.4s, v28.4s; \ - sm4e b7.4s, v28.4s; \ - sm4e b0.4s, v29.4s; \ - sm4e b1.4s, v29.4s; \ - sm4e b2.4s, v29.4s; \ - sm4e b3.4s, v29.4s; \ - sm4e b4.4s, v29.4s; \ - sm4e b5.4s, v29.4s; \ - sm4e b6.4s, v29.4s; \ - sm4e b7.4s, v29.4s; \ - sm4e b0.4s, v30.4s; \ - sm4e b1.4s, v30.4s; \ - sm4e b2.4s, v30.4s; \ - sm4e b3.4s, v30.4s; \ - sm4e b4.4s, v30.4s; \ - sm4e b5.4s, v30.4s; \ - sm4e b6.4s, v30.4s; \ - sm4e b7.4s, v30.4s; \ - sm4e b0.4s, v31.4s; \ - sm4e b1.4s, v31.4s; \ - sm4e b2.4s, v31.4s; \ - sm4e b3.4s, v31.4s; \ - sm4e b4.4s, v31.4s; \ - sm4e b5.4s, v31.4s; \ - sm4e b6.4s, v31.4s; \ - sm4e b7.4s, v31.4s; \ - rev64 b0.4s, b0.4s; \ - rev64 b1.4s, b1.4s; \ - rev64 b2.4s, b2.4s; \ - rev64 b3.4s, b3.4s; \ - rev64 b4.4s, b4.4s; \ - rev64 b5.4s, b5.4s; \ - rev64 b6.4s, b6.4s; \ - rev64 b7.4s, b7.4s; \ - ext b0.16b, b0.16b, b0.16b, #8; \ - ext b1.16b, b1.16b, b1.16b, #8; \ - ext b2.16b, b2.16b, b2.16b, #8; \ - ext b3.16b, b3.16b, b3.16b, #8; \ - ext b4.16b, b4.16b, b4.16b, #8; \ - ext b5.16b, b5.16b, b5.16b, #8; \ - ext b6.16b, b6.16b, b6.16b, #8; \ - ext b7.16b, b7.16b, b7.16b, #8; \ - rev32 b0.16b, b0.16b; \ - rev32 b1.16b, b1.16b; \ - rev32 b2.16b, b2.16b; \ - rev32 b3.16b, b3.16b; \ - rev32 b4.16b, b4.16b; \ - rev32 b5.16b, b5.16b; \ - rev32 b6.16b, b6.16b; \ - rev32 b7.16b, b7.16b; - .align 3 SYM_FUNC_START(sm4_ce_expand_key) @@ -268,7 +102,7 @@ SYM_FUNC_START(sm4_ce_crypt_block) * x1: dst * x2: src */ - PREPARE; + SM4_PREPARE(x0) ld1 {v0.16b}, [x2]; SM4_CRYPT_BLK(v0); @@ -285,7 +119,7 @@ SYM_FUNC_START(sm4_ce_crypt) * x2: src * w3: nblocks */ - PREPARE; + SM4_PREPARE(x0) .Lcrypt_loop_blk: sub w3, w3, #8; @@ -337,26 +171,50 @@ SYM_FUNC_START(sm4_ce_cbc_enc) * x3: iv (big endian, 128 bit) * w4: nblocks */ - PREPARE; + SM4_PREPARE(x0) + + ld1 {RIV.16b}, [x3] + +.Lcbc_enc_loop_4x: + cmp w4, #4 + blt .Lcbc_enc_loop_1x + + sub w4, w4, #4 - ld1 {RIV.16b}, [x3]; + ld1 {v0.16b-v3.16b}, [x2], #64 -.Lcbc_enc_loop: - sub w4, w4, #1; + eor v0.16b, v0.16b, RIV.16b + SM4_CRYPT_BLK(v0) + eor v1.16b, v1.16b, v0.16b + SM4_CRYPT_BLK(v1) + eor v2.16b, v2.16b, v1.16b + SM4_CRYPT_BLK(v2) + eor v3.16b, v3.16b, v2.16b + SM4_CRYPT_BLK(v3) - ld1 {RTMP0.16b}, [x2], #16; - eor RIV.16b, RIV.16b, RTMP0.16b; + st1 {v0.16b-v3.16b}, [x1], #64 + mov RIV.16b, v3.16b - SM4_CRYPT_BLK(RIV); + cbz w4, .Lcbc_enc_end + b .Lcbc_enc_loop_4x - st1 {RIV.16b}, [x1], #16; +.Lcbc_enc_loop_1x: + sub w4, w4, #1 - cbnz w4, .Lcbc_enc_loop; + ld1 {v0.16b}, [x2], #16 + eor RIV.16b, RIV.16b, v0.16b + SM4_CRYPT_BLK(RIV) + + st1 {RIV.16b}, [x1], #16 + + cbnz w4, .Lcbc_enc_loop_1x + +.Lcbc_enc_end: /* store new IV */ - st1 {RIV.16b}, [x3]; + st1 {RIV.16b}, [x3] - ret; + ret SYM_FUNC_END(sm4_ce_cbc_enc) .align 3 @@ -368,79 +226,93 @@ SYM_FUNC_START(sm4_ce_cbc_dec) * x3: iv (big endian, 128 bit) * w4: nblocks */ - PREPARE; + SM4_PREPARE(x0) - ld1 {RIV.16b}, [x3]; + ld1 {RIV.16b}, [x3] -.Lcbc_loop_blk: - sub w4, w4, #8; - tbnz w4, #31, .Lcbc_tail8; +.Lcbc_dec_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lcbc_dec_4x - ld1 {v0.16b-v3.16b}, [x2], #64; - ld1 {v4.16b-v7.16b}, [x2]; + ld1 {v0.16b-v3.16b}, [x2], #64 + ld1 {v4.16b-v7.16b}, [x2], #64 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + rev32 v8.16b, v0.16b + rev32 v9.16b, v1.16b + rev32 v10.16b, v2.16b + rev32 v11.16b, v3.16b + rev32 v12.16b, v4.16b + rev32 v13.16b, v5.16b + rev32 v14.16b, v6.16b + rev32 v15.16b, v7.16b - sub x2, x2, #64; - eor v0.16b, v0.16b, RIV.16b; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v1.16b, v1.16b, RTMP0.16b; - eor v2.16b, v2.16b, RTMP1.16b; - eor v3.16b, v3.16b, RTMP2.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15) - eor v4.16b, v4.16b, RTMP3.16b; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v5.16b, v5.16b, RTMP0.16b; - eor v6.16b, v6.16b, RTMP1.16b; - eor v7.16b, v7.16b, RTMP2.16b; + eor v8.16b, v8.16b, RIV.16b + eor v9.16b, v9.16b, v0.16b + eor v10.16b, v10.16b, v1.16b + eor v11.16b, v11.16b, v2.16b + eor v12.16b, v12.16b, v3.16b + eor v13.16b, v13.16b, v4.16b + eor v14.16b, v14.16b, v5.16b + eor v15.16b, v15.16b, v6.16b - mov RIV.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; + st1 {v8.16b-v11.16b}, [x1], #64 + st1 {v12.16b-v15.16b}, [x1], #64 - cbz w4, .Lcbc_end; - b .Lcbc_loop_blk; + mov RIV.16b, v7.16b -.Lcbc_tail8: - add w4, w4, #8; - cmp w4, #4; - blt .Lcbc_tail4; + cbz w4, .Lcbc_dec_end + b .Lcbc_dec_loop_8x - sub w4, w4, #4; +.Lcbc_dec_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lcbc_dec_loop_1x - ld1 {v0.16b-v3.16b}, [x2]; + sub w4, w4, #4 - SM4_CRYPT_BLK4(v0, v1, v2, v3); + ld1 {v0.16b-v3.16b}, [x2], #64 - eor v0.16b, v0.16b, RIV.16b; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v1.16b, v1.16b, RTMP0.16b; - eor v2.16b, v2.16b, RTMP1.16b; - eor v3.16b, v3.16b, RTMP2.16b; + rev32 v8.16b, v0.16b + rev32 v9.16b, v1.16b + rev32 v10.16b, v2.16b + rev32 v11.16b, v3.16b - mov RIV.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + SM4_CRYPT_BLK4_BE(v8, v9, v10, v11) - cbz w4, .Lcbc_end; + eor v8.16b, v8.16b, RIV.16b + eor v9.16b, v9.16b, v0.16b + eor v10.16b, v10.16b, v1.16b + eor v11.16b, v11.16b, v2.16b -.Lcbc_tail4: - sub w4, w4, #1; + st1 {v8.16b-v11.16b}, [x1], #64 - ld1 {v0.16b}, [x2]; + mov RIV.16b, v3.16b - SM4_CRYPT_BLK(v0); + cbz w4, .Lcbc_dec_end - eor v0.16b, v0.16b, RIV.16b; - ld1 {RIV.16b}, [x2], #16; - st1 {v0.16b}, [x1], #16; +.Lcbc_dec_loop_1x: + sub w4, w4, #1 + + ld1 {v0.16b}, [x2], #16 + + rev32 v8.16b, v0.16b + + SM4_CRYPT_BLK_BE(v8) - cbnz w4, .Lcbc_tail4; + eor v8.16b, v8.16b, RIV.16b + st1 {v8.16b}, [x1], #16 -.Lcbc_end: + mov RIV.16b, v0.16b + + cbnz w4, .Lcbc_dec_loop_1x + +.Lcbc_dec_end: /* store new IV */ - st1 {RIV.16b}, [x3]; + st1 {RIV.16b}, [x3] - ret; + ret SYM_FUNC_END(sm4_ce_cbc_dec) .align 3 @@ -452,25 +324,57 @@ SYM_FUNC_START(sm4_ce_cfb_enc) * x3: iv (big endian, 128 bit) * w4: nblocks */ - PREPARE; + SM4_PREPARE(x0) + + ld1 {RIV.16b}, [x3] + +.Lcfb_enc_loop_4x: + cmp w4, #4 + blt .Lcfb_enc_loop_1x + + sub w4, w4, #4 + + ld1 {v0.16b-v3.16b}, [x2], #64 + + rev32 v8.16b, RIV.16b + SM4_CRYPT_BLK_BE(v8) + eor v0.16b, v0.16b, v8.16b + + rev32 v8.16b, v0.16b + SM4_CRYPT_BLK_BE(v8) + eor v1.16b, v1.16b, v8.16b + + rev32 v8.16b, v1.16b + SM4_CRYPT_BLK_BE(v8) + eor v2.16b, v2.16b, v8.16b + + rev32 v8.16b, v2.16b + SM4_CRYPT_BLK_BE(v8) + eor v3.16b, v3.16b, v8.16b - ld1 {RIV.16b}, [x3]; + st1 {v0.16b-v3.16b}, [x1], #64 + mov RIV.16b, v3.16b -.Lcfb_enc_loop: - sub w4, w4, #1; + cbz w4, .Lcfb_enc_end + b .Lcfb_enc_loop_4x - SM4_CRYPT_BLK(RIV); +.Lcfb_enc_loop_1x: + sub w4, w4, #1 - ld1 {RTMP0.16b}, [x2], #16; - eor RIV.16b, RIV.16b, RTMP0.16b; - st1 {RIV.16b}, [x1], #16; + ld1 {v0.16b}, [x2], #16 - cbnz w4, .Lcfb_enc_loop; + SM4_CRYPT_BLK(RIV) + eor RIV.16b, RIV.16b, v0.16b + st1 {RIV.16b}, [x1], #16 + + cbnz w4, .Lcfb_enc_loop_1x + +.Lcfb_enc_end: /* store new IV */ - st1 {RIV.16b}, [x3]; + st1 {RIV.16b}, [x3] - ret; + ret SYM_FUNC_END(sm4_ce_cfb_enc) .align 3 @@ -482,79 +386,91 @@ SYM_FUNC_START(sm4_ce_cfb_dec) * x3: iv (big endian, 128 bit) * w4: nblocks */ - PREPARE; + SM4_PREPARE(x0) - ld1 {v0.16b}, [x3]; + ld1 {RIV.16b}, [x3] -.Lcfb_loop_blk: - sub w4, w4, #8; - tbnz w4, #31, .Lcfb_tail8; +.Lcfb_dec_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lcfb_dec_4x - ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; - ld1 {v4.16b-v7.16b}, [x2]; + ld1 {v0.16b-v3.16b}, [x2], #64 + ld1 {v4.16b-v7.16b}, [x2], #64 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); + rev32 v8.16b, RIV.16b + rev32 v9.16b, v0.16b + rev32 v10.16b, v1.16b + rev32 v11.16b, v2.16b + rev32 v12.16b, v3.16b + rev32 v13.16b, v4.16b + rev32 v14.16b, v5.16b + rev32 v15.16b, v6.16b - sub x2, x2, #48; - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15) - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v4.16b, v4.16b, RTMP0.16b; - eor v5.16b, v5.16b, RTMP1.16b; - eor v6.16b, v6.16b, RTMP2.16b; - eor v7.16b, v7.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; + mov RIV.16b, v7.16b - mov v0.16b, RTMP3.16b; + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b - cbz w4, .Lcfb_end; - b .Lcfb_loop_blk; + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 -.Lcfb_tail8: - add w4, w4, #8; - cmp w4, #4; - blt .Lcfb_tail4; + cbz w4, .Lcfb_dec_end + b .Lcfb_dec_loop_8x - sub w4, w4, #4; +.Lcfb_dec_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lcfb_dec_loop_1x - ld1 {v1.16b, v2.16b, v3.16b}, [x2]; + sub w4, w4, #4 - SM4_CRYPT_BLK4(v0, v1, v2, v3); + ld1 {v0.16b-v3.16b}, [x2], #64 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + rev32 v8.16b, RIV.16b + rev32 v9.16b, v0.16b + rev32 v10.16b, v1.16b + rev32 v11.16b, v2.16b - mov v0.16b, RTMP3.16b; + SM4_CRYPT_BLK4_BE(v8, v9, v10, v11) - cbz w4, .Lcfb_end; + mov RIV.16b, v3.16b -.Lcfb_tail4: - sub w4, w4, #1; + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b - SM4_CRYPT_BLK(v0); + st1 {v0.16b-v3.16b}, [x1], #64 - ld1 {RTMP0.16b}, [x2], #16; - eor v0.16b, v0.16b, RTMP0.16b; - st1 {v0.16b}, [x1], #16; + cbz w4, .Lcfb_dec_end + +.Lcfb_dec_loop_1x: + sub w4, w4, #1 + + ld1 {v0.16b}, [x2], #16 - mov v0.16b, RTMP0.16b; + SM4_CRYPT_BLK(RIV) - cbnz w4, .Lcfb_tail4; + eor RIV.16b, RIV.16b, v0.16b + st1 {RIV.16b}, [x1], #16 -.Lcfb_end: + mov RIV.16b, v0.16b + + cbnz w4, .Lcfb_dec_loop_1x + +.Lcfb_dec_end: /* store new IV */ - st1 {v0.16b}, [x3]; + st1 {RIV.16b}, [x3] - ret; + ret SYM_FUNC_END(sm4_ce_cfb_dec) .align 3 @@ -566,95 +482,99 @@ SYM_FUNC_START(sm4_ce_ctr_enc) * x3: ctr (big endian, 128 bit) * w4: nblocks */ - PREPARE; + SM4_PREPARE(x0) - ldp x7, x8, [x3]; - rev x7, x7; - rev x8, x8; + ldp x7, x8, [x3] + rev x7, x7 + rev x8, x8 -.Lctr_loop_blk: - sub w4, w4, #8; - tbnz w4, #31, .Lctr_tail8; +.Lctr_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lctr_4x -#define inc_le128(vctr) \ - mov vctr.d[1], x8; \ - mov vctr.d[0], x7; \ - adds x8, x8, #1; \ - adc x7, x7, xzr; \ - rev64 vctr.16b, vctr.16b; +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + rev64 vctr.16b, vctr.16b; \ + adc x7, x7, xzr; /* construct CTRs */ - inc_le128(v0); /* +0 */ - inc_le128(v1); /* +1 */ - inc_le128(v2); /* +2 */ - inc_le128(v3); /* +3 */ - inc_le128(v4); /* +4 */ - inc_le128(v5); /* +5 */ - inc_le128(v6); /* +6 */ - inc_le128(v7); /* +7 */ + inc_le128(v0) /* +0 */ + inc_le128(v1) /* +1 */ + inc_le128(v2) /* +2 */ + inc_le128(v3) /* +3 */ + inc_le128(v4) /* +4 */ + inc_le128(v5) /* +5 */ + inc_le128(v6) /* +6 */ + inc_le128(v7) /* +7 */ + + ld1 {v8.16b-v11.16b}, [x2], #64 + ld1 {v12.16b-v15.16b}, [x2], #64 + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 + + cbz w4, .Lctr_end + b .Lctr_loop_8x + +.Lctr_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lctr_loop_1x + + sub w4, w4, #4 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); - - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; + /* construct CTRs */ + inc_le128(v0) /* +0 */ + inc_le128(v1) /* +1 */ + inc_le128(v2) /* +2 */ + inc_le128(v3) /* +3 */ - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v4.16b, v4.16b, RTMP0.16b; - eor v5.16b, v5.16b, RTMP1.16b; - eor v6.16b, v6.16b, RTMP2.16b; - eor v7.16b, v7.16b, RTMP3.16b; - st1 {v4.16b-v7.16b}, [x1], #64; + ld1 {v8.16b-v11.16b}, [x2], #64 - cbz w4, .Lctr_end; - b .Lctr_loop_blk; + SM4_CRYPT_BLK4(v0, v1, v2, v3) -.Lctr_tail8: - add w4, w4, #8; - cmp w4, #4; - blt .Lctr_tail4; + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b - sub w4, w4, #4; + st1 {v0.16b-v3.16b}, [x1], #64 - /* construct CTRs */ - inc_le128(v0); /* +0 */ - inc_le128(v1); /* +1 */ - inc_le128(v2); /* +2 */ - inc_le128(v3); /* +3 */ + cbz w4, .Lctr_end - SM4_CRYPT_BLK4(v0, v1, v2, v3); - - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; - eor v0.16b, v0.16b, RTMP0.16b; - eor v1.16b, v1.16b, RTMP1.16b; - eor v2.16b, v2.16b, RTMP2.16b; - eor v3.16b, v3.16b, RTMP3.16b; - st1 {v0.16b-v3.16b}, [x1], #64; - - cbz w4, .Lctr_end; - -.Lctr_tail4: - sub w4, w4, #1; +.Lctr_loop_1x: + sub w4, w4, #1 /* construct CTRs */ - inc_le128(v0); + inc_le128(v0) - SM4_CRYPT_BLK(v0); + ld1 {v8.16b}, [x2], #16 - ld1 {RTMP0.16b}, [x2], #16; - eor v0.16b, v0.16b, RTMP0.16b; - st1 {v0.16b}, [x1], #16; + SM4_CRYPT_BLK(v0) + + eor v0.16b, v0.16b, v8.16b + st1 {v0.16b}, [x1], #16 - cbnz w4, .Lctr_tail4; + cbnz w4, .Lctr_loop_1x .Lctr_end: /* store new CTR */ - rev x7, x7; - rev x8, x8; - stp x7, x8, [x3]; + rev x7, x7 + rev x8, x8 + stp x7, x8, [x3] - ret; + ret SYM_FUNC_END(sm4_ce_ctr_enc) diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 496d55c0d01a4..e56e81b1f35f7 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -26,9 +26,9 @@ asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src); asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src, unsigned int nblks); asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src, - u8 *iv, unsigned int nblks); + u8 *iv, unsigned int nblocks); asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, - u8 *iv, unsigned int nblks); + u8 *iv, unsigned int nblocks); asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblks); asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, @@ -94,66 +94,56 @@ static int sm4_ecb_decrypt(struct skcipher_request *req) return sm4_ecb_do_crypt(req, ctx->rkey_dec); } -static int sm4_cbc_encrypt(struct skcipher_request *req) +static int sm4_cbc_crypt(struct skcipher_request *req, + struct sm4_ctx *ctx, bool encrypt) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - unsigned int nblks; + unsigned int nblocks; - kernel_neon_begin(); + nblocks = nbytes / SM4_BLOCK_SIZE; + if (nblocks) { + kernel_neon_begin(); - nblks = BYTES2BLKS(nbytes); - if (nblks) { - sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks); - nbytes -= nblks * SM4_BLOCK_SIZE; - } + if (encrypt) + sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, + walk.iv, nblocks); + else + sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, + walk.iv, nblocks); - kernel_neon_end(); + kernel_neon_end(); + } - err = skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); } return err; } -static int sm4_cbc_decrypt(struct skcipher_request *req) +static int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes) > 0) { - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - unsigned int nblks; - - kernel_neon_begin(); - - nblks = BYTES2BLKS(nbytes); - if (nblks) { - sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks); - nbytes -= nblks * SM4_BLOCK_SIZE; - } - - kernel_neon_end(); + return sm4_cbc_crypt(req, ctx, true); +} - err = skcipher_walk_done(&walk, nbytes); - } +static int sm4_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); - return err; + return sm4_cbc_crypt(req, ctx, false); } static int sm4_cfb_encrypt(struct skcipher_request *req) -- GitLab From cb9ba02b07d18172c6a6dcc69410c56482903230 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:54:59 +0800 Subject: [PATCH 0364/1423] crypto: arm64/sm4 - simplify sm4_ce_expand_key() of CE implementation Use a 128-bit swap mask and tbl instruction to simplify the implementation for generating SM4 rkey_dec. Also fixed the issue of not being wrapped by kernel_neon_begin/end() when using the sm4_ce_expand_key() function. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-core.S | 46 ++++++++++++++++----------------- arch/arm64/crypto/sm4-ce-glue.c | 2 ++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index 41fc745a85284..9e4b4f01cdf3e 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -65,32 +65,23 @@ SYM_FUNC_START(sm4_ce_expand_key) sm4ekey v6.4s, v5.4s, v30.4s; sm4ekey v7.4s, v6.4s, v31.4s; + adr_l x5, .Lbswap128_mask + ld1 {v24.16b}, [x5] + st1 {v0.16b-v3.16b}, [x1], #64; st1 {v4.16b-v7.16b}, [x1]; - rev64 v7.4s, v7.4s; - rev64 v6.4s, v6.4s; - rev64 v5.4s, v5.4s; - rev64 v4.4s, v4.4s; - rev64 v3.4s, v3.4s; - rev64 v2.4s, v2.4s; - rev64 v1.4s, v1.4s; - rev64 v0.4s, v0.4s; - ext v7.16b, v7.16b, v7.16b, #8; - ext v6.16b, v6.16b, v6.16b, #8; - ext v5.16b, v5.16b, v5.16b, #8; - ext v4.16b, v4.16b, v4.16b, #8; - ext v3.16b, v3.16b, v3.16b, #8; - ext v2.16b, v2.16b, v2.16b, #8; - ext v1.16b, v1.16b, v1.16b, #8; - ext v0.16b, v0.16b, v0.16b, #8; - st1 {v7.16b}, [x2], #16; - st1 {v6.16b}, [x2], #16; - st1 {v5.16b}, [x2], #16; - st1 {v4.16b}, [x2], #16; - st1 {v3.16b}, [x2], #16; - st1 {v2.16b}, [x2], #16; - st1 {v1.16b}, [x2], #16; - st1 {v0.16b}, [x2]; + + tbl v16.16b, {v7.16b}, v24.16b + tbl v17.16b, {v6.16b}, v24.16b + tbl v18.16b, {v5.16b}, v24.16b + tbl v19.16b, {v4.16b}, v24.16b + tbl v20.16b, {v3.16b}, v24.16b + tbl v21.16b, {v2.16b}, v24.16b + tbl v22.16b, {v1.16b}, v24.16b + tbl v23.16b, {v0.16b}, v24.16b + + st1 {v16.16b-v19.16b}, [x2], #64 + st1 {v20.16b-v23.16b}, [x2] ret; SYM_FUNC_END(sm4_ce_expand_key) @@ -578,3 +569,10 @@ SYM_FUNC_START(sm4_ce_ctr_enc) ret SYM_FUNC_END(sm4_ce_ctr_enc) + + + .section ".rodata", "a" + .align 4 +.Lbswap128_mask: + .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b + .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index e56e81b1f35f7..ff2d8442d4730 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -44,8 +44,10 @@ static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, if (key_len != SM4_KEY_SIZE) return -EINVAL; + kernel_neon_begin(); sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, crypto_sm4_fk, crypto_sm4_ck); + kernel_neon_end(); return 0; } -- GitLab From 45089dbe5952e9afbe2a3b3054105f2a694930f1 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:00 +0800 Subject: [PATCH 0365/1423] crypto: arm64/sm4 - export reusable CE acceleration functions In the accelerated implementation of the SM4 algorithm using the Crypto Extension instructions, there are some functions that can be reused in the upcoming accelerated implementation of the GCM/CCM mode, and the CBC/CFB encryption is reused in the optimized implementation of SVESM4. Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-glue.c | 5 +++++ arch/arm64/crypto/sm4-ce.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 arch/arm64/crypto/sm4-ce.h diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index ff2d8442d4730..63abcadc684ba 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -36,6 +36,11 @@ asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblks); +EXPORT_SYMBOL(sm4_ce_expand_key); +EXPORT_SYMBOL(sm4_ce_crypt_block); +EXPORT_SYMBOL(sm4_ce_cbc_enc); +EXPORT_SYMBOL(sm4_ce_cfb_enc); + static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { diff --git a/arch/arm64/crypto/sm4-ce.h b/arch/arm64/crypto/sm4-ce.h new file mode 100644 index 0000000000000..109c21b37590d --- /dev/null +++ b/arch/arm64/crypto/sm4-ce.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 common functions for Crypto Extensions + * Copyright (C) 2022 Tianjia Zhang + */ + +void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec, + const u32 *fk, const u32 *ck); + +void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src); + +void sm4_ce_cbc_enc(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblocks); + +void sm4_ce_cfb_enc(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nblocks); -- GitLab From b1863fd0742f8da21f6f994e14e820db5831bd74 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:01 +0800 Subject: [PATCH 0366/1423] crypto: arm64/sm4 - add CE implementation for CTS-CBC mode This patch is a CE-optimized assembly implementation for CTS-CBC mode. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 218 mode of tcrypt, and compared the performance before and after this patch (the driver used before this patch is cts(cbc-sm4-ce)). The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: Before: cts(cbc-sm4-ce) | 16 64 128 256 1024 1420 4096 ----------------+-------------------------------------------------------------- CTS-CBC enc | 286.09 297.17 457.97 627.75 868.58 900.80 957.69 CTS-CBC dec | 286.67 285.63 538.35 947.08 2241.03 2577.32 3391.14 After: cts-cbc-sm4-ce | 16 64 128 256 1024 1420 4096 ----------------+-------------------------------------------------------------- CTS-CBC enc | 288.19 428.80 593.57 741.04 911.73 931.80 950.00 CTS-CBC dec | 292.22 468.99 838.23 1380.76 2741.17 3036.42 3409.62 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-core.S | 102 ++++++++++++++++++++++++++++++++ arch/arm64/crypto/sm4-ce-glue.c | 94 +++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index 9e4b4f01cdf3e..414d29f8110b3 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -306,6 +306,100 @@ SYM_FUNC_START(sm4_ce_cbc_dec) ret SYM_FUNC_END(sm4_ce_cbc_dec) +.align 3 +SYM_FUNC_START(sm4_ce_cbc_cts_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nbytes + */ + SM4_PREPARE(x0) + + sub w5, w4, #16 + uxtw x5, w5 + + ld1 {RIV.16b}, [x3] + + ld1 {v0.16b}, [x2] + eor RIV.16b, RIV.16b, v0.16b + SM4_CRYPT_BLK(RIV) + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + add x2, x2, x5 + ld1 {v1.16b}, [x2] + + /* create Cn from En-1 */ + tbl v0.16b, {RIV.16b}, v3.16b + /* padding Pn with zeros */ + tbl v1.16b, {v1.16b}, v4.16b + + eor v1.16b, v1.16b, RIV.16b + SM4_CRYPT_BLK(v1) + + /* overlapping stores */ + add x5, x1, x5 + st1 {v0.16b}, [x5] + st1 {v1.16b}, [x1] + + ret +SYM_FUNC_END(sm4_ce_cbc_cts_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_cbc_cts_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * w4: nbytes + */ + SM4_PREPARE(x0) + + sub w5, w4, #16 + uxtw x5, w5 + + ld1 {RIV.16b}, [x3] + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + ld1 {v0.16b}, [x2], x5 + ld1 {v1.16b}, [x2] + + SM4_CRYPT_BLK(v0) + /* select the first Ln bytes of Xn to create Pn */ + tbl v2.16b, {v0.16b}, v3.16b + eor v2.16b, v2.16b, v1.16b + + /* overwrite the first Ln bytes with Cn to create En-1 */ + tbx v0.16b, {v1.16b}, v4.16b + SM4_CRYPT_BLK(v0) + eor v0.16b, v0.16b, RIV.16b + + /* overlapping stores */ + add x5, x1, x5 + st1 {v2.16b}, [x5] + st1 {v0.16b}, [x1] + + ret +SYM_FUNC_END(sm4_ce_cbc_cts_dec) + .align 3 SYM_FUNC_START(sm4_ce_cfb_enc) /* input: @@ -576,3 +670,11 @@ SYM_FUNC_END(sm4_ce_ctr_enc) .Lbswap128_mask: .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 + +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 63abcadc684ba..4d4072c7bfa2e 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #define BYTES2BLKS(nbytes) ((nbytes) >> 4) @@ -29,6 +30,10 @@ asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblocks); asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblocks); +asmlinkage void sm4_ce_cbc_cts_enc(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nbytes); +asmlinkage void sm4_ce_cbc_cts_dec(const u32 *rkey, u8 *dst, const u8 *src, + u8 *iv, unsigned int nbytes); asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblks); asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, @@ -153,6 +158,78 @@ static int sm4_cbc_decrypt(struct skcipher_request *req) return sm4_cbc_crypt(req, ctx, false); } +static int sm4_cbc_cts_crypt(struct skcipher_request *req, bool encrypt) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct scatterlist *src = req->src; + struct scatterlist *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int cbc_blocks; + int err; + + if (req->cryptlen < SM4_BLOCK_SIZE) + return -EINVAL; + + if (req->cryptlen == SM4_BLOCK_SIZE) + return sm4_cbc_crypt(req, ctx, encrypt); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + /* handle the CBC cryption part */ + cbc_blocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2; + if (cbc_blocks) { + skcipher_request_set_crypt(&subreq, src, dst, + cbc_blocks * SM4_BLOCK_SIZE, + req->iv); + + err = sm4_cbc_crypt(&subreq, ctx, encrypt); + if (err) + return err; + + dst = src = scatterwalk_ffwd(sg_src, src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * SM4_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_neon_begin(); + + if (encrypt) + sm4_ce_cbc_cts_enc(ctx->rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes); + else + sm4_ce_cbc_cts_dec(ctx->rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes); + + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int sm4_cbc_cts_encrypt(struct skcipher_request *req) +{ + return sm4_cbc_cts_crypt(req, true); +} + +static int sm4_cbc_cts_decrypt(struct skcipher_request *req) +{ + return sm4_cbc_cts_crypt(req, false); +} + static int sm4_cfb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -342,6 +419,22 @@ static struct skcipher_alg sm4_algs[] = { .setkey = sm4_setkey, .encrypt = sm4_ctr_crypt, .decrypt = sm4_ctr_crypt, + }, { + .base = { + .cra_name = "cts(cbc(sm4))", + .cra_driver_name = "cts-cbc-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = SM4_BLOCK_SIZE * 2, + .setkey = sm4_setkey, + .encrypt = sm4_cbc_cts_encrypt, + .decrypt = sm4_cbc_cts_decrypt, } }; @@ -365,5 +458,6 @@ MODULE_ALIAS_CRYPTO("ecb(sm4)"); MODULE_ALIAS_CRYPTO("cbc(sm4)"); MODULE_ALIAS_CRYPTO("cfb(sm4)"); MODULE_ALIAS_CRYPTO("ctr(sm4)"); +MODULE_ALIAS_CRYPTO("cts(cbc(sm4))"); MODULE_AUTHOR("Tianjia Zhang "); MODULE_LICENSE("GPL v2"); -- GitLab From 01f633113b19534ab4f4e9cf72d8e72fb3568901 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:02 +0800 Subject: [PATCH 0367/1423] crypto: arm64/sm4 - add CE implementation for XTS mode This patch is a CE-optimized assembly implementation for XTS mode. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 218 mode of tcrypt, and compared the performance before and after this patch (the driver used before this patch is xts(ecb-sm4-ce)). The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: Before: xts(ecb-sm4-ce) | 16 64 128 256 1024 1420 4096 ----------------+-------------------------------------------------------------- XTS enc | 117.17 430.56 732.92 1134.98 2007.03 2136.23 2347.20 XTS dec | 116.89 429.02 733.40 1132.96 2006.13 2130.50 2347.92 After: xts-sm4-ce | 16 64 128 256 1024 1420 4096 ----------------+-------------------------------------------------------------- XTS enc | 224.68 798.91 1248.08 1714.60 2413.73 2467.84 2612.62 XTS dec | 229.85 791.34 1237.79 1720.00 2413.30 2473.84 2611.95 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 4 +- arch/arm64/crypto/sm4-ce-core.S | 343 ++++++++++++++++++++++++++++++++ arch/arm64/crypto/sm4-ce-glue.c | 159 ++++++++++++++- 3 files changed, 504 insertions(+), 2 deletions(-) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 4b121dc0cfba2..8939f5ae9214d 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -231,7 +231,7 @@ config CRYPTO_SM4_ARM64_CE - NEON (Advanced SIMD) extensions config CRYPTO_SM4_ARM64_CE_BLK - tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (ARMv8 Crypto Extensions)" + tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR/XTS (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_SM4 @@ -242,6 +242,8 @@ config CRYPTO_SM4_ARM64_CE_BLK - CBC (Cipher Block Chaining) mode (NIST SP800-38A) - CFB (Cipher Feedback) mode (NIST SP800-38A) - CTR (Counter) mode (NIST SP800-38A) + - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E + and IEEE 1619) Architecture: arm64 using: - ARMv8 Crypto Extensions diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index 414d29f8110b3..ddd15ec09d38c 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -35,6 +35,7 @@ #define RTMP3 v19 #define RIV v20 +#define RMASK v21 .align 3 @@ -665,6 +666,348 @@ SYM_FUNC_START(sm4_ce_ctr_enc) SYM_FUNC_END(sm4_ce_ctr_enc) +#define tweak_next(vt, vin, RTMP) \ + sshr RTMP.2d, vin.2d, #63; \ + and RTMP.16b, RTMP.16b, RMASK.16b; \ + add vt.2d, vin.2d, vin.2d; \ + ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \ + eor vt.16b, vt.16b, RTMP.16b; + +.align 3 +SYM_FUNC_START(sm4_ce_xts_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: tweak (big endian, 128 bit) + * w4: nbytes + * x5: round key array for IV + */ + ld1 {v8.16b}, [x3] + + cbz x5, .Lxts_enc_nofirst + + SM4_PREPARE(x5) + + /* Generate first tweak */ + SM4_CRYPT_BLK(v8) + +.Lxts_enc_nofirst: + SM4_PREPARE(x0) + + ands w5, w4, #15 + lsr w4, w4, #4 + sub w6, w4, #1 + csel w4, w4, w6, eq + uxtw x5, w5 + + movi RMASK.2s, #0x1 + movi RTMP0.2s, #0x87 + uzp1 RMASK.4s, RMASK.4s, RTMP0.4s + + cbz w4, .Lxts_enc_cts + +.Lxts_enc_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lxts_enc_4x + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + tweak_next(v12, v11, RTMP3) + tweak_next(v13, v12, RTMP0) + tweak_next(v14, v13, RTMP1) + tweak_next(v15, v14, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + ld1 {v4.16b-v7.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 + + tweak_next(v8, v15, RTMP3) + + cbz w4, .Lxts_enc_cts + b .Lxts_enc_loop_8x + +.Lxts_enc_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lxts_enc_loop_1x + + sub w4, w4, #4 + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + st1 {v0.16b-v3.16b}, [x1], #64 + + tweak_next(v8, v11, RTMP3) + + cbz w4, .Lxts_enc_cts + +.Lxts_enc_loop_1x: + sub w4, w4, #1 + + ld1 {v0.16b}, [x2], #16 + eor v0.16b, v0.16b, v8.16b + + SM4_CRYPT_BLK(v0) + + eor v0.16b, v0.16b, v8.16b + st1 {v0.16b}, [x1], #16 + + tweak_next(v8, v8, RTMP0) + + cbnz w4, .Lxts_enc_loop_1x + +.Lxts_enc_cts: + cbz x5, .Lxts_enc_end + + /* cipher text stealing */ + + tweak_next(v9, v8, RTMP0) + ld1 {v0.16b}, [x2] + eor v0.16b, v0.16b, v8.16b + SM4_CRYPT_BLK(v0) + eor v0.16b, v0.16b, v8.16b + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + add x2, x2, x5 + ld1 {v1.16b}, [x2] + + /* create Cn from En-1 */ + tbl v2.16b, {v0.16b}, v3.16b + /* padding Pn with En-1 at the end */ + tbx v0.16b, {v1.16b}, v4.16b + + eor v0.16b, v0.16b, v9.16b + SM4_CRYPT_BLK(v0) + eor v0.16b, v0.16b, v9.16b + + + /* overlapping stores */ + add x5, x1, x5 + st1 {v2.16b}, [x5] + st1 {v0.16b}, [x1] + + b .Lxts_enc_ret + +.Lxts_enc_end: + /* store new tweak */ + st1 {v8.16b}, [x3] + +.Lxts_enc_ret: + ret +SYM_FUNC_END(sm4_ce_xts_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_xts_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: tweak (big endian, 128 bit) + * w4: nbytes + * x5: round key array for IV + */ + ld1 {v8.16b}, [x3] + + cbz x5, .Lxts_dec_nofirst + + SM4_PREPARE(x5) + + /* Generate first tweak */ + SM4_CRYPT_BLK(v8) + +.Lxts_dec_nofirst: + SM4_PREPARE(x0) + + ands w5, w4, #15 + lsr w4, w4, #4 + sub w6, w4, #1 + csel w4, w4, w6, eq + uxtw x5, w5 + + movi RMASK.2s, #0x1 + movi RTMP0.2s, #0x87 + uzp1 RMASK.4s, RMASK.4s, RTMP0.4s + + cbz w4, .Lxts_dec_cts + +.Lxts_dec_loop_8x: + sub w4, w4, #8 + tbnz w4, #31, .Lxts_dec_4x + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + tweak_next(v12, v11, RTMP3) + tweak_next(v13, v12, RTMP0) + tweak_next(v14, v13, RTMP1) + tweak_next(v15, v14, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + ld1 {v4.16b-v7.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + st1 {v0.16b-v3.16b}, [x1], #64 + st1 {v4.16b-v7.16b}, [x1], #64 + + tweak_next(v8, v15, RTMP3) + + cbz w4, .Lxts_dec_cts + b .Lxts_dec_loop_8x + +.Lxts_dec_4x: + add w4, w4, #8 + cmp w4, #4 + blt .Lxts_dec_loop_1x + + sub w4, w4, #4 + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + st1 {v0.16b-v3.16b}, [x1], #64 + + tweak_next(v8, v11, RTMP3) + + cbz w4, .Lxts_dec_cts + +.Lxts_dec_loop_1x: + sub w4, w4, #1 + + ld1 {v0.16b}, [x2], #16 + eor v0.16b, v0.16b, v8.16b + + SM4_CRYPT_BLK(v0) + + eor v0.16b, v0.16b, v8.16b + st1 {v0.16b}, [x1], #16 + + tweak_next(v8, v8, RTMP0) + + cbnz w4, .Lxts_dec_loop_1x + +.Lxts_dec_cts: + cbz x5, .Lxts_dec_end + + /* cipher text stealing */ + + tweak_next(v9, v8, RTMP0) + ld1 {v0.16b}, [x2] + eor v0.16b, v0.16b, v9.16b + SM4_CRYPT_BLK(v0) + eor v0.16b, v0.16b, v9.16b + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + add x2, x2, x5 + ld1 {v1.16b}, [x2] + + /* create Cn from En-1 */ + tbl v2.16b, {v0.16b}, v3.16b + /* padding Pn with En-1 at the end */ + tbx v0.16b, {v1.16b}, v4.16b + + eor v0.16b, v0.16b, v8.16b + SM4_CRYPT_BLK(v0) + eor v0.16b, v0.16b, v8.16b + + + /* overlapping stores */ + add x5, x1, x5 + st1 {v2.16b}, [x5] + st1 {v0.16b}, [x1] + + b .Lxts_dec_ret + +.Lxts_dec_end: + /* store new tweak */ + st1 {v8.16b}, [x3] + +.Lxts_dec_ret: + ret +SYM_FUNC_END(sm4_ce_xts_dec) + + .section ".rodata", "a" .align 4 .Lbswap128_mask: diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 4d4072c7bfa2e..8222766f712a4 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #define BYTES2BLKS(nbytes) ((nbytes) >> 4) @@ -40,12 +41,23 @@ asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblks); asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *iv, unsigned int nblks); +asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src, + u8 *tweak, unsigned int nbytes, + const u32 *rkey2_enc); +asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src, + u8 *tweak, unsigned int nbytes, + const u32 *rkey2_enc); EXPORT_SYMBOL(sm4_ce_expand_key); EXPORT_SYMBOL(sm4_ce_crypt_block); EXPORT_SYMBOL(sm4_ce_cbc_enc); EXPORT_SYMBOL(sm4_ce_cfb_enc); +struct sm4_xts_ctx { + struct sm4_ctx key1; + struct sm4_ctx key2; +}; + static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { @@ -61,6 +73,29 @@ static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, return 0; } +static int sm4_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int ret; + + if (key_len != SM4_KEY_SIZE * 2) + return -EINVAL; + + ret = xts_verify_key(tfm, key, key_len); + if (ret) + return ret; + + kernel_neon_begin(); + sm4_ce_expand_key(key, ctx->key1.rkey_enc, + ctx->key1.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); + sm4_ce_expand_key(&key[SM4_KEY_SIZE], ctx->key2.rkey_enc, + ctx->key2.rkey_dec, crypto_sm4_fk, crypto_sm4_ck); + kernel_neon_end(); + + return 0; +} + static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) { struct skcipher_walk walk; @@ -357,6 +392,111 @@ static int sm4_ctr_crypt(struct skcipher_request *req) return err; } +static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % SM4_BLOCK_SIZE; + const u32 *rkey2_enc = ctx->key2.rkey_enc; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + if (req->cryptlen < SM4_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int nblocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + nblocks * SM4_BLOCK_SIZE, req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + } else { + tail = 0; + } + + while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) { + if (nbytes < walk.total) + nbytes &= ~(SM4_BLOCK_SIZE - 1); + + kernel_neon_begin(); + + if (encrypt) + sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, nbytes, + rkey2_enc); + else + sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, nbytes, + rkey2_enc); + + kernel_neon_end(); + + rkey2_enc = NULL; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; + } + + if (likely(tail == 0)) + return 0; + + /* handle ciphertext stealing */ + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); + + skcipher_request_set_crypt(&subreq, src, dst, SM4_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_neon_begin(); + + if (encrypt) + sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes, + rkey2_enc); + else + sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, + walk.src.virt.addr, walk.iv, walk.nbytes, + rkey2_enc); + + kernel_neon_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int sm4_xts_encrypt(struct skcipher_request *req) +{ + return sm4_xts_crypt(req, true); +} + +static int sm4_xts_decrypt(struct skcipher_request *req) +{ + return sm4_xts_crypt(req, false); +} + static struct skcipher_alg sm4_algs[] = { { .base = { @@ -435,6 +575,22 @@ static struct skcipher_alg sm4_algs[] = { .setkey = sm4_setkey, .encrypt = sm4_cbc_cts_encrypt, .decrypt = sm4_cbc_cts_decrypt, + }, { + .base = { + .cra_name = "xts(sm4)", + .cra_driver_name = "xts-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_xts_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE * 2, + .max_keysize = SM4_KEY_SIZE * 2, + .ivsize = SM4_BLOCK_SIZE, + .walksize = SM4_BLOCK_SIZE * 2, + .setkey = sm4_xts_setkey, + .encrypt = sm4_xts_encrypt, + .decrypt = sm4_xts_decrypt, } }; @@ -451,7 +607,7 @@ static void __exit sm4_exit(void) module_cpu_feature_match(SM4, sm4_init); module_exit(sm4_exit); -MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR/XTS using ARMv8 Crypto Extensions"); MODULE_ALIAS_CRYPTO("sm4-ce"); MODULE_ALIAS_CRYPTO("sm4"); MODULE_ALIAS_CRYPTO("ecb(sm4)"); @@ -459,5 +615,6 @@ MODULE_ALIAS_CRYPTO("cbc(sm4)"); MODULE_ALIAS_CRYPTO("cfb(sm4)"); MODULE_ALIAS_CRYPTO("ctr(sm4)"); MODULE_ALIAS_CRYPTO("cts(cbc(sm4))"); +MODULE_ALIAS_CRYPTO("xts(sm4)"); MODULE_AUTHOR("Tianjia Zhang "); MODULE_LICENSE("GPL v2"); -- GitLab From 6b5360a5e0ad357b73776d092437715ba4a77865 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:03 +0800 Subject: [PATCH 0368/1423] crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac This patch is a CE-optimized assembly implementation for cmac/xcbc/cbcmac. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 300 mode of tcrypt, and compared the performance before and after this patch (the driver used before this patch is XXXmac(sm4-ce)). The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: Before: update-size | 16 64 256 1024 2048 4096 8192 ---------------+-------------------------------------------------------- cmac(sm4-ce) | 293.33 403.69 503.76 527.78 531.10 535.46 535.81 xcbc(sm4-ce) | 292.83 402.50 504.02 529.08 529.87 536.55 538.24 cbcmac(sm4-ce) | 318.42 415.79 497.12 515.05 523.15 521.19 523.01 After: update-size | 16 64 256 1024 2048 4096 8192 ---------------+-------------------------------------------------------- cmac-sm4-ce | 371.99 675.28 903.56 971.65 980.57 990.40 991.04 xcbc-sm4-ce | 372.11 674.55 903.47 971.61 980.96 990.42 991.10 cbcmac-sm4-ce | 371.63 675.33 903.23 972.07 981.42 990.93 991.45 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-core.S | 70 +++++++++ arch/arm64/crypto/sm4-ce-glue.c | 267 +++++++++++++++++++++++++++++++- 2 files changed, 336 insertions(+), 1 deletion(-) diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S index ddd15ec09d38c..877b80c54a0d3 100644 --- a/arch/arm64/crypto/sm4-ce-core.S +++ b/arch/arm64/crypto/sm4-ce-core.S @@ -35,6 +35,7 @@ #define RTMP3 v19 #define RIV v20 +#define RMAC v20 #define RMASK v21 @@ -1007,6 +1008,75 @@ SYM_FUNC_START(sm4_ce_xts_dec) ret SYM_FUNC_END(sm4_ce_xts_dec) +.align 3 +SYM_FUNC_START(sm4_ce_mac_update) + /* input: + * x0: round key array, CTX + * x1: digest + * x2: src + * w3: nblocks + * w4: enc_before + * w5: enc_after + */ + SM4_PREPARE(x0) + + ld1 {RMAC.16b}, [x1] + + cbz w4, .Lmac_update + + SM4_CRYPT_BLK(RMAC) + +.Lmac_update: + cbz w3, .Lmac_ret + + sub w6, w3, #1 + cmp w5, wzr + csel w3, w3, w6, ne + + cbz w3, .Lmac_end + +.Lmac_loop_4x: + cmp w3, #4 + blt .Lmac_loop_1x + + sub w3, w3, #4 + + ld1 {v0.16b-v3.16b}, [x2], #64 + + eor RMAC.16b, RMAC.16b, v0.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v1.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v2.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v3.16b + SM4_CRYPT_BLK(RMAC) + + cbz w3, .Lmac_end + b .Lmac_loop_4x + +.Lmac_loop_1x: + sub w3, w3, #1 + + ld1 {v0.16b}, [x2], #16 + + eor RMAC.16b, RMAC.16b, v0.16b + SM4_CRYPT_BLK(RMAC) + + cbnz w3, .Lmac_loop_1x + + +.Lmac_end: + cbnz w5, .Lmac_ret + + ld1 {v0.16b}, [x2], #16 + eor RMAC.16b, RMAC.16b, v0.16b + +.Lmac_ret: + st1 {RMAC.16b}, [x1] + ret +SYM_FUNC_END(sm4_ce_mac_update) + .section ".rodata", "a" .align 4 diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 8222766f712a4..0a2d32ed3bdec 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -14,8 +14,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -47,6 +49,9 @@ asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src, asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src, u8 *tweak, unsigned int nbytes, const u32 *rkey2_enc); +asmlinkage void sm4_ce_mac_update(const u32 *rkey_enc, u8 *digest, + const u8 *src, unsigned int nblocks, + bool enc_before, bool enc_after); EXPORT_SYMBOL(sm4_ce_expand_key); EXPORT_SYMBOL(sm4_ce_crypt_block); @@ -58,6 +63,16 @@ struct sm4_xts_ctx { struct sm4_ctx key2; }; +struct sm4_mac_tfm_ctx { + struct sm4_ctx key; + u8 __aligned(8) consts[]; +}; + +struct sm4_mac_desc_ctx { + unsigned int len; + u8 digest[SM4_BLOCK_SIZE]; +}; + static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { @@ -594,13 +609,260 @@ static struct skcipher_alg sm4_algs[] = { } }; +static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + kernel_neon_begin(); + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + kernel_neon_end(); + + return 0; +} + +static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + be128 *consts = (be128 *)ctx->consts; + u64 a, b; + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + memset(consts, 0, SM4_BLOCK_SIZE); + + kernel_neon_begin(); + + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + + /* encrypt the zero block */ + sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)consts, (const u8 *)consts); + + kernel_neon_end(); + + /* gf(2^128) multiply zero-ciphertext with u and u^2 */ + a = be64_to_cpu(consts[0].a); + b = be64_to_cpu(consts[0].b); + consts[0].a = cpu_to_be64((a << 1) | (b >> 63)); + consts[0].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); + + a = be64_to_cpu(consts[0].a); + b = be64_to_cpu(consts[0].b); + consts[1].a = cpu_to_be64((a << 1) | (b >> 63)); + consts[1].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); + + return 0; +} + +static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + u8 __aligned(8) key2[SM4_BLOCK_SIZE]; + static u8 const ks[3][SM4_BLOCK_SIZE] = { + { [0 ... SM4_BLOCK_SIZE - 1] = 0x1}, + { [0 ... SM4_BLOCK_SIZE - 1] = 0x2}, + { [0 ... SM4_BLOCK_SIZE - 1] = 0x3}, + }; + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + kernel_neon_begin(); + + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + + sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]); + sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2); + + sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + + kernel_neon_end(); + + return 0; +} + +static int sm4_mac_init(struct shash_desc *desc) +{ + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); + + memset(ctx->digest, 0, SM4_BLOCK_SIZE); + ctx->len = 0; + + return 0; +} + +static int sm4_mac_update(struct shash_desc *desc, const u8 *p, + unsigned int len) +{ + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int l, nblocks; + + if (len == 0) + return 0; + + if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) { + l = min(len, SM4_BLOCK_SIZE - ctx->len); + + crypto_xor(ctx->digest + ctx->len, p, l); + ctx->len += l; + len -= l; + p += l; + } + + if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) { + kernel_neon_begin(); + + if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) { + sm4_ce_crypt_block(tctx->key.rkey_enc, + ctx->digest, ctx->digest); + ctx->len = 0; + } else { + nblocks = len / SM4_BLOCK_SIZE; + len %= SM4_BLOCK_SIZE; + + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, + nblocks, (ctx->len == SM4_BLOCK_SIZE), + (len != 0)); + + p += nblocks * SM4_BLOCK_SIZE; + + if (len == 0) + ctx->len = SM4_BLOCK_SIZE; + } + + kernel_neon_end(); + + if (len) { + crypto_xor(ctx->digest, p, len); + ctx->len = len; + } + } + + return 0; +} + +static int sm4_cmac_final(struct shash_desc *desc, u8 *out) +{ + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); + const u8 *consts = tctx->consts; + + if (ctx->len != SM4_BLOCK_SIZE) { + ctx->digest[ctx->len] ^= 0x80; + consts += SM4_BLOCK_SIZE; + } + + kernel_neon_begin(); + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, + false, true); + kernel_neon_end(); + + memcpy(out, ctx->digest, SM4_BLOCK_SIZE); + + return 0; +} + +static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out) +{ + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); + + if (ctx->len) { + kernel_neon_begin(); + sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, + ctx->digest); + kernel_neon_end(); + } + + memcpy(out, ctx->digest, SM4_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg sm4_mac_algs[] = { + { + .base = { + .cra_name = "cmac(sm4)", + .cra_driver_name = "cmac-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + + SM4_BLOCK_SIZE * 2, + .cra_module = THIS_MODULE, + }, + .digestsize = SM4_BLOCK_SIZE, + .init = sm4_mac_init, + .update = sm4_mac_update, + .final = sm4_cmac_final, + .setkey = sm4_cmac_setkey, + .descsize = sizeof(struct sm4_mac_desc_ctx), + }, { + .base = { + .cra_name = "xcbc(sm4)", + .cra_driver_name = "xcbc-sm4-ce", + .cra_priority = 400, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + + SM4_BLOCK_SIZE * 2, + .cra_module = THIS_MODULE, + }, + .digestsize = SM4_BLOCK_SIZE, + .init = sm4_mac_init, + .update = sm4_mac_update, + .final = sm4_cmac_final, + .setkey = sm4_xcbc_setkey, + .descsize = sizeof(struct sm4_mac_desc_ctx), + }, { + .base = { + .cra_name = "cbcmac(sm4)", + .cra_driver_name = "cbcmac-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx), + .cra_module = THIS_MODULE, + }, + .digestsize = SM4_BLOCK_SIZE, + .init = sm4_mac_init, + .update = sm4_mac_update, + .final = sm4_cbcmac_final, + .setkey = sm4_cbcmac_setkey, + .descsize = sizeof(struct sm4_mac_desc_ctx), + } +}; + static int __init sm4_init(void) { - return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); + int err; + + err = crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); + if (err) + return err; + + err = crypto_register_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs)); + if (err) + goto out_err; + + return 0; + +out_err: + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); + return err; } static void __exit sm4_exit(void) { + crypto_unregister_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs)); crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); } @@ -616,5 +878,8 @@ MODULE_ALIAS_CRYPTO("cfb(sm4)"); MODULE_ALIAS_CRYPTO("ctr(sm4)"); MODULE_ALIAS_CRYPTO("cts(cbc(sm4))"); MODULE_ALIAS_CRYPTO("xts(sm4)"); +MODULE_ALIAS_CRYPTO("cmac(sm4)"); +MODULE_ALIAS_CRYPTO("xcbc(sm4)"); +MODULE_ALIAS_CRYPTO("cbcmac(sm4)"); MODULE_AUTHOR("Tianjia Zhang "); MODULE_LICENSE("GPL v2"); -- GitLab From 67fa3a7fdf80c80ee737840dfdd225260e5c1044 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:04 +0800 Subject: [PATCH 0369/1423] crypto: arm64/sm4 - add CE implementation for CCM mode This patch is a CE-optimized assembly implementation for CCM mode. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 223 and 225 modes of tcrypt, and compared the performance before and after this patch (the driver used before this patch is ccm_base(ctr-sm4-ce,cbcmac-sm4-ce)). The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: Before (rfc4309(ccm_base(ctr-sm4-ce,cbcmac-sm4-ce))): ccm(sm4) | 16 64 256 512 1024 1420 4096 8192 -------------+--------------------------------------------------------------- CCM enc | 35.07 125.40 336.47 468.17 581.97 619.18 712.56 736.01 CCM dec | 34.87 124.40 335.08 466.75 581.04 618.81 712.25 735.89 CCM mb enc | 34.71 123.96 333.92 465.39 579.91 617.49 711.45 734.92 CCM mb dec | 34.42 122.80 331.02 462.81 578.28 616.42 709.88 734.19 After (rfc4309(ccm-sm4-ce)): ccm-sm4-ce | 16 64 256 512 1024 1420 4096 8192 -------------+--------------------------------------------------------------- CCM enc | 77.12 249.82 569.94 725.17 839.27 867.71 952.87 969.89 CCM dec | 75.90 247.26 566.29 722.12 836.90 865.95 951.74 968.57 CCM mb enc | 75.98 245.25 562.91 718.99 834.76 864.70 950.17 967.90 CCM mb dec | 75.06 243.78 560.58 717.13 833.68 862.70 949.35 967.11 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 16 ++ arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/sm4-ce-ccm-core.S | 328 ++++++++++++++++++++++++++++ arch/arm64/crypto/sm4-ce-ccm-glue.c | 303 +++++++++++++++++++++++++ 4 files changed, 650 insertions(+) create mode 100644 arch/arm64/crypto/sm4-ce-ccm-core.S create mode 100644 arch/arm64/crypto/sm4-ce-ccm-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 8939f5ae9214d..2611036a3e3fa 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -281,6 +281,22 @@ config CRYPTO_AES_ARM64_CE_CCM - ARMv8 Crypto Extensions - NEON (Advanced SIMD) extensions +config CRYPTO_SM4_ARM64_CE_CCM + tristate "AEAD cipher: SM4 in CCM mode (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AEAD + select CRYPTO_SM4 + select CRYPTO_SM4_ARM64_CE_BLK + help + AEAD cipher: SM4 cipher algorithms (OSCCA GB/T 32907-2016) with + CCM (Counter with Cipher Block Chaining-Message Authentication Code) + authenticated encryption mode (NIST SP800-38C) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - NEON (Advanced SIMD) extensions + config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF (PMULL)" depends on KERNEL_MODE_NEON && CRC_T10DIF diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 087f1625e7751..843ea52669653 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -29,6 +29,9 @@ sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_CCM) += sm4-ce-ccm.o +sm4-ce-ccm-y := sm4-ce-ccm-glue.o sm4-ce-ccm-core.o + obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o diff --git a/arch/arm64/crypto/sm4-ce-ccm-core.S b/arch/arm64/crypto/sm4-ce-ccm-core.S new file mode 100644 index 0000000000000..028207c4afd0f --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-ccm-core.S @@ -0,0 +1,328 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions + * as specified in rfc8998 + * https://datatracker.ietf.org/doc/html/rfc8998 + * + * Copyright (C) 2022 Tianjia Zhang + */ + +#include +#include +#include "sm4-ce-asm.h" + +.arch armv8-a+crypto + +.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b\().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +/* Register macros */ + +#define RMAC v16 + +/* Helper macros. */ + +#define inc_le128(vctr) \ + mov vctr.d[1], x8; \ + mov vctr.d[0], x7; \ + adds x8, x8, #1; \ + rev64 vctr.16b, vctr.16b; \ + adc x7, x7, xzr; + + +.align 3 +SYM_FUNC_START(sm4_ce_cbcmac_update) + /* input: + * x0: round key array, CTX + * x1: mac + * x2: src + * w3: nblocks + */ + SM4_PREPARE(x0) + + ld1 {RMAC.16b}, [x1] + +.Lcbcmac_loop_4x: + cmp w3, #4 + blt .Lcbcmac_loop_1x + + sub w3, w3, #4 + + ld1 {v0.16b-v3.16b}, [x2], #64 + + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v0.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v1.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v2.16b + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v3.16b + + cbz w3, .Lcbcmac_end + b .Lcbcmac_loop_4x + +.Lcbcmac_loop_1x: + sub w3, w3, #1 + + ld1 {v0.16b}, [x2], #16 + + SM4_CRYPT_BLK(RMAC) + eor RMAC.16b, RMAC.16b, v0.16b + + cbnz w3, .Lcbcmac_loop_1x + +.Lcbcmac_end: + st1 {RMAC.16b}, [x1] + ret +SYM_FUNC_END(sm4_ce_cbcmac_update) + +.align 3 +SYM_FUNC_START(sm4_ce_ccm_final) + /* input: + * x0: round key array, CTX + * x1: ctr0 (big endian, 128 bit) + * x2: mac + */ + SM4_PREPARE(x0) + + ld1 {RMAC.16b}, [x2] + ld1 {v0.16b}, [x1] + + SM4_CRYPT_BLK2(RMAC, v0) + + /* en-/decrypt the mac with ctr0 */ + eor RMAC.16b, RMAC.16b, v0.16b + st1 {RMAC.16b}, [x2] + + ret +SYM_FUNC_END(sm4_ce_ccm_final) + +.align 3 +SYM_FUNC_START(sm4_ce_ccm_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nbytes + * x5: mac + */ + SM4_PREPARE(x0) + + ldp x7, x8, [x3] + rev x7, x7 + rev x8, x8 + + ld1 {RMAC.16b}, [x5] + +.Lccm_enc_loop_4x: + cmp w4, #(4 * 16) + blt .Lccm_enc_loop_1x + + sub w4, w4, #(4 * 16) + + /* construct CTRs */ + inc_le128(v8) /* +0 */ + inc_le128(v9) /* +1 */ + inc_le128(v10) /* +2 */ + inc_le128(v11) /* +3 */ + + ld1 {v0.16b-v3.16b}, [x2], #64 + + SM4_CRYPT_BLK2(v8, RMAC) + eor v8.16b, v8.16b, v0.16b + eor RMAC.16b, RMAC.16b, v0.16b + SM4_CRYPT_BLK2(v9, RMAC) + eor v9.16b, v9.16b, v1.16b + eor RMAC.16b, RMAC.16b, v1.16b + SM4_CRYPT_BLK2(v10, RMAC) + eor v10.16b, v10.16b, v2.16b + eor RMAC.16b, RMAC.16b, v2.16b + SM4_CRYPT_BLK2(v11, RMAC) + eor v11.16b, v11.16b, v3.16b + eor RMAC.16b, RMAC.16b, v3.16b + + st1 {v8.16b-v11.16b}, [x1], #64 + + cbz w4, .Lccm_enc_end + b .Lccm_enc_loop_4x + +.Lccm_enc_loop_1x: + cmp w4, #16 + blt .Lccm_enc_tail + + sub w4, w4, #16 + + /* construct CTRs */ + inc_le128(v8) + + ld1 {v0.16b}, [x2], #16 + + SM4_CRYPT_BLK2(v8, RMAC) + eor v8.16b, v8.16b, v0.16b + eor RMAC.16b, RMAC.16b, v0.16b + + st1 {v8.16b}, [x1], #16 + + cbz w4, .Lccm_enc_end + b .Lccm_enc_loop_1x + +.Lccm_enc_tail: + /* construct CTRs */ + inc_le128(v8) + + SM4_CRYPT_BLK2(RMAC, v8) + + /* store new MAC */ + st1 {RMAC.16b}, [x5] + +.Lccm_enc_tail_loop: + ldrb w0, [x2], #1 /* get 1 byte from input */ + umov w9, v8.b[0] /* get top crypted CTR byte */ + umov w6, RMAC.b[0] /* get top MAC byte */ + + eor w9, w9, w0 /* w9 = CTR ^ input */ + eor w6, w6, w0 /* w6 = MAC ^ input */ + + strb w9, [x1], #1 /* store out byte */ + strb w6, [x5], #1 /* store MAC byte */ + + subs w4, w4, #1 + beq .Lccm_enc_ret + + /* shift out one byte */ + ext RMAC.16b, RMAC.16b, RMAC.16b, #1 + ext v8.16b, v8.16b, v8.16b, #1 + + b .Lccm_enc_tail_loop + +.Lccm_enc_end: + /* store new MAC */ + st1 {RMAC.16b}, [x5] + + /* store new CTR */ + rev x7, x7 + rev x8, x8 + stp x7, x8, [x3] + +.Lccm_enc_ret: + ret +SYM_FUNC_END(sm4_ce_ccm_enc) + +.align 3 +SYM_FUNC_START(sm4_ce_ccm_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nbytes + * x5: mac + */ + SM4_PREPARE(x0) + + ldp x7, x8, [x3] + rev x7, x7 + rev x8, x8 + + ld1 {RMAC.16b}, [x5] + +.Lccm_dec_loop_4x: + cmp w4, #(4 * 16) + blt .Lccm_dec_loop_1x + + sub w4, w4, #(4 * 16) + + /* construct CTRs */ + inc_le128(v8) /* +0 */ + inc_le128(v9) /* +1 */ + inc_le128(v10) /* +2 */ + inc_le128(v11) /* +3 */ + + ld1 {v0.16b-v3.16b}, [x2], #64 + + SM4_CRYPT_BLK2(v8, RMAC) + eor v8.16b, v8.16b, v0.16b + eor RMAC.16b, RMAC.16b, v8.16b + SM4_CRYPT_BLK2(v9, RMAC) + eor v9.16b, v9.16b, v1.16b + eor RMAC.16b, RMAC.16b, v9.16b + SM4_CRYPT_BLK2(v10, RMAC) + eor v10.16b, v10.16b, v2.16b + eor RMAC.16b, RMAC.16b, v10.16b + SM4_CRYPT_BLK2(v11, RMAC) + eor v11.16b, v11.16b, v3.16b + eor RMAC.16b, RMAC.16b, v11.16b + + st1 {v8.16b-v11.16b}, [x1], #64 + + cbz w4, .Lccm_dec_end + b .Lccm_dec_loop_4x + +.Lccm_dec_loop_1x: + cmp w4, #16 + blt .Lccm_dec_tail + + sub w4, w4, #16 + + /* construct CTRs */ + inc_le128(v8) + + ld1 {v0.16b}, [x2], #16 + + SM4_CRYPT_BLK2(v8, RMAC) + eor v8.16b, v8.16b, v0.16b + eor RMAC.16b, RMAC.16b, v8.16b + + st1 {v8.16b}, [x1], #16 + + cbz w4, .Lccm_dec_end + b .Lccm_dec_loop_1x + +.Lccm_dec_tail: + /* construct CTRs */ + inc_le128(v8) + + SM4_CRYPT_BLK2(RMAC, v8) + + /* store new MAC */ + st1 {RMAC.16b}, [x5] + +.Lccm_dec_tail_loop: + ldrb w0, [x2], #1 /* get 1 byte from input */ + umov w9, v8.b[0] /* get top crypted CTR byte */ + umov w6, RMAC.b[0] /* get top MAC byte */ + + eor w9, w9, w0 /* w9 = CTR ^ input */ + eor w6, w6, w9 /* w6 = MAC ^ output */ + + strb w9, [x1], #1 /* store out byte */ + strb w6, [x5], #1 /* store MAC byte */ + + subs w4, w4, #1 + beq .Lccm_dec_ret + + /* shift out one byte */ + ext RMAC.16b, RMAC.16b, RMAC.16b, #1 + ext v8.16b, v8.16b, v8.16b, #1 + + b .Lccm_dec_tail_loop + +.Lccm_dec_end: + /* store new MAC */ + st1 {RMAC.16b}, [x5] + + /* store new CTR */ + rev x7, x7 + rev x8, x8 + stp x7, x8, [x3] + +.Lccm_dec_ret: + ret +SYM_FUNC_END(sm4_ce_ccm_dec) diff --git a/arch/arm64/crypto/sm4-ce-ccm-glue.c b/arch/arm64/crypto/sm4-ce-ccm-glue.c new file mode 100644 index 0000000000000..f2cec7b52efcd --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-ccm-glue.c @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions + * as specified in rfc8998 + * https://datatracker.ietf.org/doc/html/rfc8998 + * + * Copyright (C) 2022 Tianjia Zhang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sm4-ce.h" + +asmlinkage void sm4_ce_cbcmac_update(const u32 *rkey_enc, u8 *mac, + const u8 *src, unsigned int nblocks); +asmlinkage void sm4_ce_ccm_enc(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nbytes, u8 *mac); +asmlinkage void sm4_ce_ccm_dec(const u32 *rkey_enc, u8 *dst, const u8 *src, + u8 *iv, unsigned int nbytes, u8 *mac); +asmlinkage void sm4_ce_ccm_final(const u32 *rkey_enc, u8 *iv, u8 *mac); + + +static int ccm_setkey(struct crypto_aead *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_aead_ctx(tfm); + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + kernel_neon_begin(); + sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + kernel_neon_end(); + + return 0; +} + +static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + if ((authsize & 1) || authsize < 4) + return -EINVAL; + return 0; +} + +static int ccm_format_input(u8 info[], struct aead_request *req, + unsigned int msglen) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + unsigned int l = req->iv[0] + 1; + unsigned int m; + __be32 len; + + /* verify that CCM dimension 'L': 2 <= L <= 8 */ + if (l < 2 || l > 8) + return -EINVAL; + if (l < 4 && msglen >> (8 * l)) + return -EOVERFLOW; + + memset(&req->iv[SM4_BLOCK_SIZE - l], 0, l); + + memcpy(info, req->iv, SM4_BLOCK_SIZE); + + m = crypto_aead_authsize(aead); + + /* format flags field per RFC 3610/NIST 800-38C */ + *info |= ((m - 2) / 2) << 3; + if (req->assoclen) + *info |= (1 << 6); + + /* + * format message length field, + * Linux uses a u32 type to represent msglen + */ + if (l >= 4) + l = 4; + + len = cpu_to_be32(msglen); + memcpy(&info[SM4_BLOCK_SIZE - l], (u8 *)&len + 4 - l, l); + + return 0; +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct sm4_ctx *ctx = crypto_aead_ctx(aead); + struct __packed { __be16 l; __be32 h; } aadlen; + u32 assoclen = req->assoclen; + struct scatter_walk walk; + unsigned int len; + + if (assoclen < 0xff00) { + aadlen.l = cpu_to_be16(assoclen); + len = 2; + } else { + aadlen.l = cpu_to_be16(0xfffe); + put_unaligned_be32(assoclen, &aadlen.h); + len = 6; + } + + sm4_ce_crypt_block(ctx->rkey_enc, mac, mac); + crypto_xor(mac, (const u8 *)&aadlen, len); + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, assoclen); + u8 *p, *ptr; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, assoclen); + } + + p = ptr = scatterwalk_map(&walk); + assoclen -= n; + scatterwalk_advance(&walk, n); + + while (n > 0) { + unsigned int l, nblocks; + + if (len == SM4_BLOCK_SIZE) { + if (n < SM4_BLOCK_SIZE) { + sm4_ce_crypt_block(ctx->rkey_enc, + mac, mac); + + len = 0; + } else { + nblocks = n / SM4_BLOCK_SIZE; + sm4_ce_cbcmac_update(ctx->rkey_enc, + mac, ptr, nblocks); + + ptr += nblocks * SM4_BLOCK_SIZE; + n %= SM4_BLOCK_SIZE; + + continue; + } + } + + l = min(n, SM4_BLOCK_SIZE - len); + if (l) { + crypto_xor(mac + len, ptr, l); + len += l; + ptr += l; + n -= l; + } + } + + scatterwalk_unmap(p); + scatterwalk_done(&walk, 0, assoclen); + } while (assoclen); +} + +static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk, + u32 *rkey_enc, u8 mac[], + void (*sm4_ce_ccm_crypt)(const u32 *rkey_enc, u8 *dst, + const u8 *src, u8 *iv, + unsigned int nbytes, u8 *mac)) +{ + u8 __aligned(8) ctr0[SM4_BLOCK_SIZE]; + int err; + + /* preserve the initial ctr0 for the TAG */ + memcpy(ctr0, walk->iv, SM4_BLOCK_SIZE); + crypto_inc(walk->iv, SM4_BLOCK_SIZE); + + kernel_neon_begin(); + + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); + + do { + unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; + const u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + + if (walk->nbytes == walk->total) + tail = 0; + + if (walk->nbytes - tail) + sm4_ce_ccm_crypt(rkey_enc, dst, src, walk->iv, + walk->nbytes - tail, mac); + + if (walk->nbytes == walk->total) + sm4_ce_ccm_final(rkey_enc, ctr0, mac); + + kernel_neon_end(); + + if (walk->nbytes) { + err = skcipher_walk_done(walk, tail); + if (err) + return err; + if (walk->nbytes) + kernel_neon_begin(); + } + } while (walk->nbytes > 0); + + return 0; +} + +static int ccm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct sm4_ctx *ctx = crypto_aead_ctx(aead); + u8 __aligned(8) mac[SM4_BLOCK_SIZE]; + struct skcipher_walk walk; + int err; + + err = ccm_format_input(mac, req, req->cryptlen); + if (err) + return err; + + err = skcipher_walk_aead_encrypt(&walk, req, false); + if (err) + return err; + + err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_enc); + if (err) + return err; + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int ccm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + unsigned int authsize = crypto_aead_authsize(aead); + struct sm4_ctx *ctx = crypto_aead_ctx(aead); + u8 __aligned(8) mac[SM4_BLOCK_SIZE]; + u8 authtag[SM4_BLOCK_SIZE]; + struct skcipher_walk walk; + int err; + + err = ccm_format_input(mac, req, req->cryptlen - authsize); + if (err) + return err; + + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; + + err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_dec); + if (err) + return err; + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(authtag, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(authtag, mac, authsize)) + return -EBADMSG; + + return 0; +} + +static struct aead_alg sm4_ccm_alg = { + .base = { + .cra_name = "ccm(sm4)", + .cra_driver_name = "ccm-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .maxauthsize = SM4_BLOCK_SIZE, + .setkey = ccm_setkey, + .setauthsize = ccm_setauthsize, + .encrypt = ccm_encrypt, + .decrypt = ccm_decrypt, +}; + +static int __init sm4_ce_ccm_init(void) +{ + return crypto_register_aead(&sm4_ccm_alg); +} + +static void __exit sm4_ce_ccm_exit(void) +{ + crypto_unregister_aead(&sm4_ccm_alg); +} + +module_cpu_feature_match(SM4, sm4_ce_ccm_init); +module_exit(sm4_ce_ccm_exit); + +MODULE_DESCRIPTION("Synchronous SM4 in CCM mode using ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("ccm(sm4)"); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_LICENSE("GPL v2"); -- GitLab From ae1b83c7d572101b3b5cfbf40415c4cc5d469bde Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 27 Oct 2022 14:55:05 +0800 Subject: [PATCH 0370/1423] crypto: arm64/sm4 - add CE implementation for GCM mode This patch is a CE-optimized assembly implementation for GCM mode. Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 224 and 224 modes of tcrypt, and compared the performance before and after this patch (the driver used before this patch is gcm_base(ctr-sm4-ce,ghash-generic)). The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: Before (gcm_base(ctr-sm4-ce,ghash-generic)): gcm(sm4) | 16 64 256 512 1024 1420 4096 8192 -------------+--------------------------------------------------------------------- GCM enc | 25.24 64.65 104.66 116.69 123.81 125.12 129.67 130.62 GCM dec | 25.40 64.80 104.74 116.70 123.81 125.21 129.68 130.59 GCM mb enc | 24.95 64.06 104.20 116.38 123.55 124.97 129.63 130.61 GCM mb dec | 24.92 64.00 104.13 116.34 123.55 124.98 129.56 130.48 After: gcm-sm4-ce | 16 64 256 512 1024 1420 4096 8192 -------------+--------------------------------------------------------------------- GCM enc | 108.62 397.18 971.60 1283.92 1522.77 1513.39 1777.00 1806.96 GCM dec | 116.36 398.14 1004.27 1319.11 1624.21 1635.43 1932.54 1974.20 GCM mb enc | 107.13 391.79 962.05 1274.94 1514.76 1508.57 1769.07 1801.58 GCM mb dec | 113.40 389.36 988.51 1307.68 1619.10 1631.55 1931.70 1970.86 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 16 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/sm4-ce-gcm-core.S | 741 ++++++++++++++++++++++++++++ arch/arm64/crypto/sm4-ce-gcm-glue.c | 286 +++++++++++ 4 files changed, 1046 insertions(+) create mode 100644 arch/arm64/crypto/sm4-ce-gcm-core.S create mode 100644 arch/arm64/crypto/sm4-ce-gcm-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 2611036a3e3fa..6793d5bc3ee53 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -297,6 +297,22 @@ config CRYPTO_SM4_ARM64_CE_CCM - ARMv8 Crypto Extensions - NEON (Advanced SIMD) extensions +config CRYPTO_SM4_ARM64_CE_GCM + tristate "AEAD cipher: SM4 in GCM mode (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AEAD + select CRYPTO_SM4 + select CRYPTO_SM4_ARM64_CE_BLK + help + AEAD cipher: SM4 cipher algorithms (OSCCA GB/T 32907-2016) with + GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D) + + Architecture: arm64 using: + - ARMv8 Crypto Extensions + - PMULL (Polynomial Multiply Long) instructions + - NEON (Advanced SIMD) extensions + config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF (PMULL)" depends on KERNEL_MODE_NEON && CRC_T10DIF diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 843ea52669653..4818e204c2aca 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -32,6 +32,9 @@ sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_CCM) += sm4-ce-ccm.o sm4-ce-ccm-y := sm4-ce-ccm-glue.o sm4-ce-ccm-core.o +obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_GCM) += sm4-ce-gcm.o +sm4-ce-gcm-y := sm4-ce-gcm-glue.o sm4-ce-gcm-core.o + obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o diff --git a/arch/arm64/crypto/sm4-ce-gcm-core.S b/arch/arm64/crypto/sm4-ce-gcm-core.S new file mode 100644 index 0000000000000..7aa3ec18a2891 --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-gcm-core.S @@ -0,0 +1,741 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions + * as specified in rfc8998 + * https://datatracker.ietf.org/doc/html/rfc8998 + * + * Copyright (C) 2016 Jussi Kivilinna + * Copyright (C) 2022 Tianjia Zhang + */ + +#include +#include +#include "sm4-ce-asm.h" + +.arch armv8-a+crypto + +.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b\().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +/* Register macros */ + +/* Used for both encryption and decryption */ +#define RHASH v21 +#define RRCONST v22 +#define RZERO v23 + +/* Helper macros. */ + +/* + * input: m0, m1 + * output: r0:r1 (low 128-bits in r0, high in r1) + */ +#define PMUL_128x128(r0, r1, m0, m1, T0, T1) \ + ext T0.16b, m1.16b, m1.16b, #8; \ + pmull r0.1q, m0.1d, m1.1d; \ + pmull T1.1q, m0.1d, T0.1d; \ + pmull2 T0.1q, m0.2d, T0.2d; \ + pmull2 r1.1q, m0.2d, m1.2d; \ + eor T0.16b, T0.16b, T1.16b; \ + ext T1.16b, RZERO.16b, T0.16b, #8; \ + ext T0.16b, T0.16b, RZERO.16b, #8; \ + eor r0.16b, r0.16b, T1.16b; \ + eor r1.16b, r1.16b, T0.16b; + +#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \ + r2, r3, m2, m3, T2, T3, \ + r4, r5, m4, m5, T4, T5, \ + r6, r7, m6, m7, T6, T7) \ + ext T0.16b, m1.16b, m1.16b, #8; \ + ext T2.16b, m3.16b, m3.16b, #8; \ + ext T4.16b, m5.16b, m5.16b, #8; \ + ext T6.16b, m7.16b, m7.16b, #8; \ + pmull r0.1q, m0.1d, m1.1d; \ + pmull r2.1q, m2.1d, m3.1d; \ + pmull r4.1q, m4.1d, m5.1d; \ + pmull r6.1q, m6.1d, m7.1d; \ + pmull T1.1q, m0.1d, T0.1d; \ + pmull T3.1q, m2.1d, T2.1d; \ + pmull T5.1q, m4.1d, T4.1d; \ + pmull T7.1q, m6.1d, T6.1d; \ + pmull2 T0.1q, m0.2d, T0.2d; \ + pmull2 T2.1q, m2.2d, T2.2d; \ + pmull2 T4.1q, m4.2d, T4.2d; \ + pmull2 T6.1q, m6.2d, T6.2d; \ + pmull2 r1.1q, m0.2d, m1.2d; \ + pmull2 r3.1q, m2.2d, m3.2d; \ + pmull2 r5.1q, m4.2d, m5.2d; \ + pmull2 r7.1q, m6.2d, m7.2d; \ + eor T0.16b, T0.16b, T1.16b; \ + eor T2.16b, T2.16b, T3.16b; \ + eor T4.16b, T4.16b, T5.16b; \ + eor T6.16b, T6.16b, T7.16b; \ + ext T1.16b, RZERO.16b, T0.16b, #8; \ + ext T3.16b, RZERO.16b, T2.16b, #8; \ + ext T5.16b, RZERO.16b, T4.16b, #8; \ + ext T7.16b, RZERO.16b, T6.16b, #8; \ + ext T0.16b, T0.16b, RZERO.16b, #8; \ + ext T2.16b, T2.16b, RZERO.16b, #8; \ + ext T4.16b, T4.16b, RZERO.16b, #8; \ + ext T6.16b, T6.16b, RZERO.16b, #8; \ + eor r0.16b, r0.16b, T1.16b; \ + eor r2.16b, r2.16b, T3.16b; \ + eor r4.16b, r4.16b, T5.16b; \ + eor r6.16b, r6.16b, T7.16b; \ + eor r1.16b, r1.16b, T0.16b; \ + eor r3.16b, r3.16b, T2.16b; \ + eor r5.16b, r5.16b, T4.16b; \ + eor r7.16b, r7.16b, T6.16b; + +/* + * input: r0:r1 (low 128-bits in r0, high in r1) + * output: a + */ +#define REDUCTION(a, r0, r1, rconst, T0, T1) \ + pmull2 T0.1q, r1.2d, rconst.2d; \ + ext T1.16b, T0.16b, RZERO.16b, #8; \ + ext T0.16b, RZERO.16b, T0.16b, #8; \ + eor r1.16b, r1.16b, T1.16b; \ + eor r0.16b, r0.16b, T0.16b; \ + pmull T0.1q, r1.1d, rconst.1d; \ + eor a.16b, r0.16b, T0.16b; + +#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) \ + rev32 b0.16b, b0.16b; \ + ext T0.16b, m1.16b, m1.16b, #8; \ + sm4e b0.4s, v24.4s; \ + pmull r0.1q, m0.1d, m1.1d; \ + sm4e b0.4s, v25.4s; \ + pmull T1.1q, m0.1d, T0.1d; \ + sm4e b0.4s, v26.4s; \ + pmull2 T0.1q, m0.2d, T0.2d; \ + sm4e b0.4s, v27.4s; \ + pmull2 r1.1q, m0.2d, m1.2d; \ + sm4e b0.4s, v28.4s; \ + eor T0.16b, T0.16b, T1.16b; \ + sm4e b0.4s, v29.4s; \ + ext T1.16b, RZERO.16b, T0.16b, #8; \ + sm4e b0.4s, v30.4s; \ + ext T0.16b, T0.16b, RZERO.16b, #8; \ + sm4e b0.4s, v31.4s; \ + eor r0.16b, r0.16b, T1.16b; \ + rev64 b0.4s, b0.4s; \ + eor r1.16b, r1.16b, T0.16b; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + +#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \ + r0, r1, m0, m1, T0, T1, \ + r2, r3, m2, m3, T2, T3, \ + r4, r5, m4, m5, T4, T5) \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + ext T0.16b, m1.16b, m1.16b, #8; \ + ext T2.16b, m3.16b, m3.16b, #8; \ + ext T4.16b, m5.16b, m5.16b, #8; \ + sm4e b0.4s, v24.4s; \ + sm4e b1.4s, v24.4s; \ + sm4e b2.4s, v24.4s; \ + pmull r0.1q, m0.1d, m1.1d; \ + pmull r2.1q, m2.1d, m3.1d; \ + pmull r4.1q, m4.1d, m5.1d; \ + sm4e b0.4s, v25.4s; \ + sm4e b1.4s, v25.4s; \ + sm4e b2.4s, v25.4s; \ + pmull T1.1q, m0.1d, T0.1d; \ + pmull T3.1q, m2.1d, T2.1d; \ + pmull T5.1q, m4.1d, T4.1d; \ + sm4e b0.4s, v26.4s; \ + sm4e b1.4s, v26.4s; \ + sm4e b2.4s, v26.4s; \ + pmull2 T0.1q, m0.2d, T0.2d; \ + pmull2 T2.1q, m2.2d, T2.2d; \ + pmull2 T4.1q, m4.2d, T4.2d; \ + sm4e b0.4s, v27.4s; \ + sm4e b1.4s, v27.4s; \ + sm4e b2.4s, v27.4s; \ + pmull2 r1.1q, m0.2d, m1.2d; \ + pmull2 r3.1q, m2.2d, m3.2d; \ + pmull2 r5.1q, m4.2d, m5.2d; \ + sm4e b0.4s, v28.4s; \ + sm4e b1.4s, v28.4s; \ + sm4e b2.4s, v28.4s; \ + eor T0.16b, T0.16b, T1.16b; \ + eor T2.16b, T2.16b, T3.16b; \ + eor T4.16b, T4.16b, T5.16b; \ + sm4e b0.4s, v29.4s; \ + sm4e b1.4s, v29.4s; \ + sm4e b2.4s, v29.4s; \ + ext T1.16b, RZERO.16b, T0.16b, #8; \ + ext T3.16b, RZERO.16b, T2.16b, #8; \ + ext T5.16b, RZERO.16b, T4.16b, #8; \ + sm4e b0.4s, v30.4s; \ + sm4e b1.4s, v30.4s; \ + sm4e b2.4s, v30.4s; \ + ext T0.16b, T0.16b, RZERO.16b, #8; \ + ext T2.16b, T2.16b, RZERO.16b, #8; \ + ext T4.16b, T4.16b, RZERO.16b, #8; \ + sm4e b0.4s, v31.4s; \ + sm4e b1.4s, v31.4s; \ + sm4e b2.4s, v31.4s; \ + eor r0.16b, r0.16b, T1.16b; \ + eor r2.16b, r2.16b, T3.16b; \ + eor r4.16b, r4.16b, T5.16b; \ + rev64 b0.4s, b0.4s; \ + rev64 b1.4s, b1.4s; \ + rev64 b2.4s, b2.4s; \ + eor r1.16b, r1.16b, T0.16b; \ + eor r3.16b, r3.16b, T2.16b; \ + eor r5.16b, r5.16b, T4.16b; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + ext b1.16b, b1.16b, b1.16b, #8; \ + ext b2.16b, b2.16b, b2.16b, #8; \ + eor r0.16b, r0.16b, r2.16b; \ + eor r1.16b, r1.16b, r3.16b; \ + rev32 b0.16b, b0.16b; \ + rev32 b1.16b, b1.16b; \ + rev32 b2.16b, b2.16b; \ + eor r0.16b, r0.16b, r4.16b; \ + eor r1.16b, r1.16b, r5.16b; + +#define inc32_le128(vctr) \ + mov vctr.d[1], x9; \ + add w6, w9, #1; \ + mov vctr.d[0], x8; \ + bfi x9, x6, #0, #32; \ + rev64 vctr.16b, vctr.16b; + +#define GTAG_HASH_LENGTHS(vctr0, vlen) \ + ld1 {vlen.16b}, [x7]; \ + /* construct CTR0 */ \ + /* the lower 32-bits of initial IV is always be32(1) */ \ + mov x6, #0x1; \ + bfi x9, x6, #0, #32; \ + mov vctr0.d[0], x8; \ + mov vctr0.d[1], x9; \ + rbit vlen.16b, vlen.16b; \ + rev64 vctr0.16b, vctr0.16b; \ + /* authtag = GCTR(CTR0, GHASH) */ \ + eor RHASH.16b, RHASH.16b, vlen.16b; \ + SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \ + RTMP0, RTMP1); \ + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \ + rbit RHASH.16b, RHASH.16b; \ + eor RHASH.16b, RHASH.16b, vctr0.16b; + + +/* Register macros for encrypt and ghash */ + +/* can be the same as input v0-v3 */ +#define RR1 v0 +#define RR3 v1 +#define RR5 v2 +#define RR7 v3 + +#define RR0 v4 +#define RR2 v5 +#define RR4 v6 +#define RR6 v7 + +#define RTMP0 v8 +#define RTMP1 v9 +#define RTMP2 v10 +#define RTMP3 v11 +#define RTMP4 v12 +#define RTMP5 v13 +#define RTMP6 v14 +#define RTMP7 v15 + +#define RH1 v16 +#define RH2 v17 +#define RH3 v18 +#define RH4 v19 + +.align 3 +SYM_FUNC_START(sm4_ce_pmull_ghash_setup) + /* input: + * x0: round key array, CTX + * x1: ghash table + */ + SM4_PREPARE(x0) + + adr_l x2, .Lghash_rconst + ld1r {RRCONST.2d}, [x2] + + eor RZERO.16b, RZERO.16b, RZERO.16b + + /* H = E(K, 0^128) */ + rev32 v0.16b, RZERO.16b + SM4_CRYPT_BLK_BE(v0) + + /* H ^ 1 */ + rbit RH1.16b, v0.16b + + /* H ^ 2 */ + PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1) + REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3) + + /* H ^ 3 */ + PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1) + REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3) + + /* H ^ 4 */ + PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1) + REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3) + + st1 {RH1.16b-RH4.16b}, [x1] + + ret +SYM_FUNC_END(sm4_ce_pmull_ghash_setup) + +.align 3 +SYM_FUNC_START(pmull_ghash_update) + /* input: + * x0: ghash table + * x1: ghash result + * x2: src + * w3: nblocks + */ + ld1 {RH1.16b-RH4.16b}, [x0] + + ld1 {RHASH.16b}, [x1] + rbit RHASH.16b, RHASH.16b + + adr_l x4, .Lghash_rconst + ld1r {RRCONST.2d}, [x4] + + eor RZERO.16b, RZERO.16b, RZERO.16b + +.Lghash_loop_4x: + cmp w3, #4 + blt .Lghash_loop_1x + + sub w3, w3, #4 + + ld1 {v0.16b-v3.16b}, [x2], #64 + + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + rbit v2.16b, v2.16b + rbit v3.16b, v3.16b + + /* + * (in0 ^ HASH) * H^4 => rr0:rr1 + * (in1) * H^3 => rr2:rr3 + * (in2) * H^2 => rr4:rr5 + * (in3) * H^1 => rr6:rr7 + */ + eor RHASH.16b, RHASH.16b, v0.16b + + PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1, + RR2, RR3, v1, RH3, RTMP2, RTMP3, + RR4, RR5, v2, RH2, RTMP4, RTMP5, + RR6, RR7, v3, RH1, RTMP6, RTMP7) + + eor RR0.16b, RR0.16b, RR2.16b + eor RR1.16b, RR1.16b, RR3.16b + eor RR0.16b, RR0.16b, RR4.16b + eor RR1.16b, RR1.16b, RR5.16b + eor RR0.16b, RR0.16b, RR6.16b + eor RR1.16b, RR1.16b, RR7.16b + + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1) + + cbz w3, .Lghash_end + b .Lghash_loop_4x + +.Lghash_loop_1x: + sub w3, w3, #1 + + ld1 {v0.16b}, [x2], #16 + rbit v0.16b, v0.16b + eor RHASH.16b, RHASH.16b, v0.16b + + PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1) + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3) + + cbnz w3, .Lghash_loop_1x + +.Lghash_end: + rbit RHASH.16b, RHASH.16b + st1 {RHASH.2d}, [x1] + + ret +SYM_FUNC_END(pmull_ghash_update) + +.align 3 +SYM_FUNC_START(sm4_ce_pmull_gcm_enc) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nbytes + * x5: ghash result + * x6: ghash table + * x7: lengths (only for last block) + */ + SM4_PREPARE(x0) + + ldp x8, x9, [x3] + rev x8, x8 + rev x9, x9 + + ld1 {RH1.16b-RH4.16b}, [x6] + + ld1 {RHASH.16b}, [x5] + rbit RHASH.16b, RHASH.16b + + adr_l x6, .Lghash_rconst + ld1r {RRCONST.2d}, [x6] + + eor RZERO.16b, RZERO.16b, RZERO.16b + + cbz w4, .Lgcm_enc_hash_len + +.Lgcm_enc_loop_4x: + cmp w4, #(4 * 16) + blt .Lgcm_enc_loop_1x + + sub w4, w4, #(4 * 16) + + /* construct CTRs */ + inc32_le128(v0) /* +0 */ + inc32_le128(v1) /* +1 */ + inc32_le128(v2) /* +2 */ + inc32_le128(v3) /* +3 */ + + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 + + SM4_CRYPT_BLK4(v0, v1, v2, v3) + + eor v0.16b, v0.16b, RTMP0.16b + eor v1.16b, v1.16b, RTMP1.16b + eor v2.16b, v2.16b, RTMP2.16b + eor v3.16b, v3.16b, RTMP3.16b + st1 {v0.16b-v3.16b}, [x1], #64 + + /* ghash update */ + + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + rbit v2.16b, v2.16b + rbit v3.16b, v3.16b + + /* + * (in0 ^ HASH) * H^4 => rr0:rr1 + * (in1) * H^3 => rr2:rr3 + * (in2) * H^2 => rr4:rr5 + * (in3) * H^1 => rr6:rr7 + */ + eor RHASH.16b, RHASH.16b, v0.16b + + PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1, + RR2, RR3, v1, RH3, RTMP2, RTMP3, + RR4, RR5, v2, RH2, RTMP4, RTMP5, + RR6, RR7, v3, RH1, RTMP6, RTMP7) + + eor RR0.16b, RR0.16b, RR2.16b + eor RR1.16b, RR1.16b, RR3.16b + eor RR0.16b, RR0.16b, RR4.16b + eor RR1.16b, RR1.16b, RR5.16b + eor RR0.16b, RR0.16b, RR6.16b + eor RR1.16b, RR1.16b, RR7.16b + + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1) + + cbz w4, .Lgcm_enc_hash_len + b .Lgcm_enc_loop_4x + +.Lgcm_enc_loop_1x: + cmp w4, #16 + blt .Lgcm_enc_tail + + sub w4, w4, #16 + + /* construct CTRs */ + inc32_le128(v0) + + ld1 {RTMP0.16b}, [x2], #16 + + SM4_CRYPT_BLK(v0) + + eor v0.16b, v0.16b, RTMP0.16b + st1 {v0.16b}, [x1], #16 + + /* ghash update */ + rbit v0.16b, v0.16b + eor RHASH.16b, RHASH.16b, v0.16b + PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1) + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3) + + cbz w4, .Lgcm_enc_hash_len + b .Lgcm_enc_loop_1x + +.Lgcm_enc_tail: + /* construct CTRs */ + inc32_le128(v0) + SM4_CRYPT_BLK(v0) + + /* load permute table */ + adr_l x0, .Lcts_permute_table + add x0, x0, #32 + sub x0, x0, w4, uxtw + ld1 {v3.16b}, [x0] + +.Lgcm_enc_tail_loop: + /* do encrypt */ + ldrb w0, [x2], #1 /* get 1 byte from input */ + umov w6, v0.b[0] /* get top crypted byte */ + eor w6, w6, w0 /* w6 = CTR ^ input */ + strb w6, [x1], #1 /* store out byte */ + + /* shift right out one byte */ + ext v0.16b, v0.16b, v0.16b, #1 + /* the last ciphertext is placed in high bytes */ + ins v0.b[15], w6 + + subs w4, w4, #1 + bne .Lgcm_enc_tail_loop + + /* padding last block with zeros */ + tbl v0.16b, {v0.16b}, v3.16b + + /* ghash update */ + rbit v0.16b, v0.16b + eor RHASH.16b, RHASH.16b, v0.16b + PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1) + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3) + +.Lgcm_enc_hash_len: + cbz x7, .Lgcm_enc_end + + GTAG_HASH_LENGTHS(v1, v3) + + b .Lgcm_enc_ret + +.Lgcm_enc_end: + /* store new CTR */ + rev x8, x8 + rev x9, x9 + stp x8, x9, [x3] + + rbit RHASH.16b, RHASH.16b + +.Lgcm_enc_ret: + /* store new MAC */ + st1 {RHASH.2d}, [x5] + + ret +SYM_FUNC_END(sm4_ce_pmull_gcm_enc) + +#undef RR1 +#undef RR3 +#undef RR5 +#undef RR7 +#undef RR0 +#undef RR2 +#undef RR4 +#undef RR6 +#undef RTMP0 +#undef RTMP1 +#undef RTMP2 +#undef RTMP3 +#undef RTMP4 +#undef RTMP5 +#undef RTMP6 +#undef RTMP7 +#undef RH1 +#undef RH2 +#undef RH3 +#undef RH4 + + +/* Register macros for decrypt */ + +/* v0-v2 for building CTRs, v3-v5 for saving inputs */ + +#define RR1 v6 +#define RR3 v7 +#define RR5 v8 + +#define RR0 v9 +#define RR2 v10 +#define RR4 v11 + +#define RTMP0 v12 +#define RTMP1 v13 +#define RTMP2 v14 +#define RTMP3 v15 +#define RTMP4 v16 +#define RTMP5 v17 + +#define RH1 v18 +#define RH2 v19 +#define RH3 v20 + +.align 3 +SYM_FUNC_START(sm4_ce_pmull_gcm_dec) + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * w4: nbytes + * x5: ghash result + * x6: ghash table + * x7: lengths (only for last block) + */ + SM4_PREPARE(x0) + + ldp x8, x9, [x3] + rev x8, x8 + rev x9, x9 + + ld1 {RH1.16b-RH3.16b}, [x6] + + ld1 {RHASH.16b}, [x5] + rbit RHASH.16b, RHASH.16b + + adr_l x6, .Lghash_rconst + ld1r {RRCONST.2d}, [x6] + + eor RZERO.16b, RZERO.16b, RZERO.16b + + cbz w4, .Lgcm_dec_hash_len + +.Lgcm_dec_loop_3x: + cmp w4, #(3 * 16) + blt .Lgcm_dec_loop_1x + + sub w4, w4, #(3 * 16) + + ld1 {v3.16b-v5.16b}, [x2], #(3 * 16) + + /* construct CTRs */ + inc32_le128(v0) /* +0 */ + rbit v6.16b, v3.16b + inc32_le128(v1) /* +1 */ + rbit v7.16b, v4.16b + inc32_le128(v2) /* +2 */ + rbit v8.16b, v5.16b + + eor RHASH.16b, RHASH.16b, v6.16b + + /* decrypt & ghash update */ + SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2, + RR0, RR1, RHASH, RH3, RTMP0, RTMP1, + RR2, RR3, v7, RH2, RTMP2, RTMP3, + RR4, RR5, v8, RH1, RTMP4, RTMP5) + + eor v0.16b, v0.16b, v3.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v5.16b + + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1) + + st1 {v0.16b-v2.16b}, [x1], #(3 * 16) + + cbz w4, .Lgcm_dec_hash_len + b .Lgcm_dec_loop_3x + +.Lgcm_dec_loop_1x: + cmp w4, #16 + blt .Lgcm_dec_tail + + sub w4, w4, #16 + + ld1 {v3.16b}, [x2], #16 + + /* construct CTRs */ + inc32_le128(v0) + rbit v6.16b, v3.16b + + eor RHASH.16b, RHASH.16b, v6.16b + + SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1) + + eor v0.16b, v0.16b, v3.16b + + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3) + + st1 {v0.16b}, [x1], #16 + + cbz w4, .Lgcm_dec_hash_len + b .Lgcm_dec_loop_1x + +.Lgcm_dec_tail: + /* construct CTRs */ + inc32_le128(v0) + SM4_CRYPT_BLK(v0) + + /* load permute table */ + adr_l x0, .Lcts_permute_table + add x0, x0, #32 + sub x0, x0, w4, uxtw + ld1 {v3.16b}, [x0] + +.Lgcm_dec_tail_loop: + /* do decrypt */ + ldrb w0, [x2], #1 /* get 1 byte from input */ + umov w6, v0.b[0] /* get top crypted byte */ + eor w6, w6, w0 /* w6 = CTR ^ input */ + strb w6, [x1], #1 /* store out byte */ + + /* shift right out one byte */ + ext v0.16b, v0.16b, v0.16b, #1 + /* the last ciphertext is placed in high bytes */ + ins v0.b[15], w0 + + subs w4, w4, #1 + bne .Lgcm_dec_tail_loop + + /* padding last block with zeros */ + tbl v0.16b, {v0.16b}, v3.16b + + /* ghash update */ + rbit v0.16b, v0.16b + eor RHASH.16b, RHASH.16b, v0.16b + PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1) + REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3) + +.Lgcm_dec_hash_len: + cbz x7, .Lgcm_dec_end + + GTAG_HASH_LENGTHS(v1, v3) + + b .Lgcm_dec_ret + +.Lgcm_dec_end: + /* store new CTR */ + rev x8, x8 + rev x9, x9 + stp x8, x9, [x3] + + rbit RHASH.16b, RHASH.16b + +.Lgcm_dec_ret: + /* store new MAC */ + st1 {RHASH.2d}, [x5] + + ret +SYM_FUNC_END(sm4_ce_pmull_gcm_dec) + + .section ".rodata", "a" + .align 4 +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +.Lghash_rconst: + .quad 0x87 diff --git a/arch/arm64/crypto/sm4-ce-gcm-glue.c b/arch/arm64/crypto/sm4-ce-gcm-glue.c new file mode 100644 index 0000000000000..e90ea0f17beb2 --- /dev/null +++ b/arch/arm64/crypto/sm4-ce-gcm-glue.c @@ -0,0 +1,286 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions + * as specified in rfc8998 + * https://datatracker.ietf.org/doc/html/rfc8998 + * + * Copyright (C) 2022 Tianjia Zhang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sm4-ce.h" + +asmlinkage void sm4_ce_pmull_ghash_setup(const u32 *rkey_enc, u8 *ghash_table); +asmlinkage void pmull_ghash_update(const u8 *ghash_table, u8 *ghash, + const u8 *src, unsigned int nblocks); +asmlinkage void sm4_ce_pmull_gcm_enc(const u32 *rkey_enc, u8 *dst, + const u8 *src, u8 *iv, + unsigned int nbytes, u8 *ghash, + const u8 *ghash_table, const u8 *lengths); +asmlinkage void sm4_ce_pmull_gcm_dec(const u32 *rkey_enc, u8 *dst, + const u8 *src, u8 *iv, + unsigned int nbytes, u8 *ghash, + const u8 *ghash_table, const u8 *lengths); + +#define GHASH_BLOCK_SIZE 16 +#define GCM_IV_SIZE 12 + +struct sm4_gcm_ctx { + struct sm4_ctx key; + u8 ghash_table[16 * 4]; +}; + + +static int gcm_setkey(struct crypto_aead *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_gcm_ctx *ctx = crypto_aead_ctx(tfm); + + if (key_len != SM4_KEY_SIZE) + return -EINVAL; + + kernel_neon_begin(); + + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, + crypto_sm4_fk, crypto_sm4_ck); + sm4_ce_pmull_ghash_setup(ctx->key.rkey_enc, ctx->ghash_table); + + kernel_neon_end(); + return 0; +} + +static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12 ... 16: + return 0; + default: + return -EINVAL; + } +} + +static void gcm_calculate_auth_mac(struct aead_request *req, u8 ghash[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); + u8 __aligned(8) buffer[GHASH_BLOCK_SIZE]; + u32 assoclen = req->assoclen; + struct scatter_walk walk; + unsigned int buflen = 0; + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, assoclen); + u8 *p, *ptr; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, assoclen); + } + + p = ptr = scatterwalk_map(&walk); + assoclen -= n; + scatterwalk_advance(&walk, n); + + if (n + buflen < GHASH_BLOCK_SIZE) { + memcpy(&buffer[buflen], ptr, n); + buflen += n; + } else { + unsigned int nblocks; + + if (buflen) { + unsigned int l = GHASH_BLOCK_SIZE - buflen; + + memcpy(&buffer[buflen], ptr, l); + ptr += l; + n -= l; + + pmull_ghash_update(ctx->ghash_table, ghash, + buffer, 1); + } + + nblocks = n / GHASH_BLOCK_SIZE; + if (nblocks) { + pmull_ghash_update(ctx->ghash_table, ghash, + ptr, nblocks); + ptr += nblocks * GHASH_BLOCK_SIZE; + } + + buflen = n % GHASH_BLOCK_SIZE; + if (buflen) + memcpy(&buffer[0], ptr, buflen); + } + + scatterwalk_unmap(p); + scatterwalk_done(&walk, 0, assoclen); + } while (assoclen); + + /* padding with '0' */ + if (buflen) { + memset(&buffer[buflen], 0, GHASH_BLOCK_SIZE - buflen); + pmull_ghash_update(ctx->ghash_table, ghash, buffer, 1); + } +} + +static int gcm_crypt(struct aead_request *req, struct skcipher_walk *walk, + struct sm4_gcm_ctx *ctx, u8 ghash[], + void (*sm4_ce_pmull_gcm_crypt)(const u32 *rkey_enc, + u8 *dst, const u8 *src, u8 *iv, + unsigned int nbytes, u8 *ghash, + const u8 *ghash_table, const u8 *lengths)) +{ + u8 __aligned(8) iv[SM4_BLOCK_SIZE]; + be128 __aligned(8) lengths; + int err; + + memset(ghash, 0, SM4_BLOCK_SIZE); + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(walk->total * 8); + + memcpy(iv, walk->iv, GCM_IV_SIZE); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + kernel_neon_begin(); + + if (req->assoclen) + gcm_calculate_auth_mac(req, ghash); + + do { + unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE; + const u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + + if (walk->nbytes == walk->total) { + tail = 0; + + sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, + walk->nbytes, ghash, + ctx->ghash_table, + (const u8 *)&lengths); + } else if (walk->nbytes - tail) { + sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv, + walk->nbytes - tail, ghash, + ctx->ghash_table, NULL); + } + + kernel_neon_end(); + + err = skcipher_walk_done(walk, tail); + if (err) + return err; + if (walk->nbytes) + kernel_neon_begin(); + } while (walk->nbytes > 0); + + return 0; +} + +static int gcm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); + u8 __aligned(8) ghash[SM4_BLOCK_SIZE]; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_aead_encrypt(&walk, req, false); + if (err) + return err; + + err = gcm_crypt(req, &walk, ctx, ghash, sm4_ce_pmull_gcm_enc); + if (err) + return err; + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(ghash, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int gcm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + unsigned int authsize = crypto_aead_authsize(aead); + struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead); + u8 __aligned(8) ghash[SM4_BLOCK_SIZE]; + u8 authtag[SM4_BLOCK_SIZE]; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; + + err = gcm_crypt(req, &walk, ctx, ghash, sm4_ce_pmull_gcm_dec); + if (err) + return err; + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(authtag, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(authtag, ghash, authsize)) + return -EBADMSG; + + return 0; +} + +static struct aead_alg sm4_gcm_alg = { + .base = { + .cra_name = "gcm(sm4)", + .cra_driver_name = "gcm-sm4-ce", + .cra_priority = 400, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_gcm_ctx), + .cra_module = THIS_MODULE, + }, + .ivsize = GCM_IV_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .maxauthsize = SM4_BLOCK_SIZE, + .setkey = gcm_setkey, + .setauthsize = gcm_setauthsize, + .encrypt = gcm_encrypt, + .decrypt = gcm_decrypt, +}; + +static int __init sm4_ce_gcm_init(void) +{ + if (!cpu_have_named_feature(PMULL)) + return -ENODEV; + + return crypto_register_aead(&sm4_gcm_alg); +} + +static void __exit sm4_ce_gcm_exit(void) +{ + crypto_unregister_aead(&sm4_gcm_alg); +} + +static const struct cpu_feature sm4_ce_gcm_cpu_feature[] = { + { cpu_feature(PMULL) }, + {} +}; +MODULE_DEVICE_TABLE(cpu, sm4_ce_gcm_cpu_feature); + +module_cpu_feature_match(SM4, sm4_ce_gcm_init); +module_exit(sm4_ce_gcm_exit); + +MODULE_DESCRIPTION("Synchronous SM4 in GCM mode using ARMv8 Crypto Extensions"); +MODULE_ALIAS_CRYPTO("gcm(sm4)"); +MODULE_AUTHOR("Tianjia Zhang "); +MODULE_LICENSE("GPL v2"); -- GitLab From 329cfa42e5280decfc9247598b9996e13b28c380 Mon Sep 17 00:00:00 2001 From: Ralph Siemsen Date: Thu, 27 Oct 2022 15:35:44 -0400 Subject: [PATCH 0371/1423] crypto: doc - use correct function name The hashing API does not have a function called .finish() Signed-off-by: Ralph Siemsen Signed-off-by: Herbert Xu --- Documentation/crypto/devel-algos.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/crypto/devel-algos.rst b/Documentation/crypto/devel-algos.rst index f225a953ab4b9..3506899ef83e3 100644 --- a/Documentation/crypto/devel-algos.rst +++ b/Documentation/crypto/devel-algos.rst @@ -172,7 +172,7 @@ Here are schematics of how these functions are called when operated from other part of the kernel. Note that the .setkey() call might happen before or after any of these schematics happen, but must not happen during any of these are in-flight. Please note that calling .init() -followed immediately by .finish() is also a perfectly valid +followed immediately by .final() is also a perfectly valid transformation. :: -- GitLab From d59fdbc7164a6b2a0ed45c13387deac8efeed5a2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Nov 2022 22:30:05 +0200 Subject: [PATCH 0372/1423] gpiolib: of: Make use of device_match_of_node() Make use of device_match_of_node() instead of open coding its functionality. Signed-off-by: Andy Shevchenko Reviewed-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 52616848a37c4..4b91e19366a8e 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -85,7 +85,7 @@ static int of_gpiochip_match_node_and_xlate(struct gpio_chip *chip, void *data) { struct of_phandle_args *gpiospec = data; - return chip->gpiodev->dev.of_node == gpiospec->np && + return device_match_of_node(&chip->gpiodev->dev, gpiospec->np) && chip->of_xlate && chip->of_xlate(chip, gpiospec, NULL) >= 0; } -- GitLab From 34cb9352b62366038fd2d5b9d9f393f35d0be1e0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 29 Oct 2022 21:40:46 -0700 Subject: [PATCH 0373/1423] gpiolib: of: factor out quirk setting polarity via separate property Several legacy bindings use a separate property to specify polarity of GPIOs instead of specifying it directly in the GPIO property. Factor out this code to make it easier to add more such cases. Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 98 +++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 4b91e19366a8e..e72b56a87b86c 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -186,33 +186,68 @@ static void of_gpio_try_fixup_polarity(const struct device_node *np, } } -static void of_gpio_flags_quirks(const struct device_node *np, - const char *propname, - enum of_gpio_flags *flags, - int index) +static void of_gpio_set_polarity_by_property(const struct device_node *np, + const char *propname, + enum of_gpio_flags *flags) { - of_gpio_try_fixup_polarity(np, propname, flags); + static const struct { + const char *compatible; + const char *gpio_propname; + const char *polarity_propname; + } gpios[] = { +#if IS_ENABLED(CONFIG_FEC) + /* Freescale Fast Ethernet Controller */ + { "fsl,imx25-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx27-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx28-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx6q-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,mvf600-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx6sx-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx6ul-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx8mq-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,imx8qm-fec", "phy-reset-gpios", "phy-reset-active-high" }, + { "fsl,s32v234-fec", "phy-reset-gpios", "phy-reset-active-high" }, +#endif - /* - * Some GPIO fixed regulator quirks. - * Note that active low is the default. - */ - if (IS_ENABLED(CONFIG_REGULATOR) && - (of_device_is_compatible(np, "regulator-fixed") || - of_device_is_compatible(np, "reg-fixed-voltage") || - (!(strcmp(propname, "enable-gpio") && - strcmp(propname, "enable-gpios")) && - of_device_is_compatible(np, "regulator-gpio")))) { - bool active_high = of_property_read_bool(np, - "enable-active-high"); /* * The regulator GPIO handles are specified such that the * presence or absence of "enable-active-high" solely controls * the polarity of the GPIO line. Any phandle flags must * be actively ignored. */ - of_gpio_quirk_polarity(np, active_high, flags); +#if IS_ENABLED(CONFIG_REGULATOR_FIXED_VOLTAGE) + { "regulator-fixed", "gpios", "enable-active-high" }, + { "regulator-fixed", "gpio", "enable-active-high" }, + { "reg-fixed-voltage", "gpios", "enable-active-high" }, + { "reg-fixed-voltage", "gpio", "enable-active-high" }, +#endif +#if IS_ENABLED(CONFIG_REGULATOR_GPIO) + { "regulator-gpio", "enable-gpio", "enable-active-high" }, + { "regulator-gpio", "enable-gpios", "enable-active-high" }, +#endif + }; + unsigned int i; + bool active_high; + + for (i = 0; i < ARRAY_SIZE(gpios); i++) { + if (of_device_is_compatible(np, gpios[i].compatible) && + !strcmp(propname, gpios[i].gpio_propname)) { + active_high = of_property_read_bool(np, + gpios[i].polarity_propname); + of_gpio_quirk_polarity(np, active_high, flags); + break; + } } +} + +static void of_gpio_flags_quirks(const struct device_node *np, + const char *propname, + enum of_gpio_flags *flags, + int index) +{ + of_gpio_try_fixup_polarity(np, propname, flags); + of_gpio_set_polarity_by_property(np, propname, flags); + /* * Legacy open drain handling for fixed voltage regulators. */ @@ -267,33 +302,6 @@ static void of_gpio_flags_quirks(const struct device_node *np, !strcmp(propname, "snps,reset-gpio") && of_property_read_bool(np, "snps,reset-active-low")) *flags |= OF_GPIO_ACTIVE_LOW; - - /* - * Freescale Fast Ethernet Controller uses a separate property to - * describe polarity of the phy reset line. - */ - if (IS_ENABLED(CONFIG_FEC)) { - static const char * const fec_devices[] = { - "fsl,imx25-fec", - "fsl,imx27-fec", - "fsl,imx28-fec", - "fsl,imx6q-fec", - "fsl,mvf600-fec", - "fsl,imx6sx-fec", - "fsl,imx6ul-fec", - "fsl,imx8mq-fec", - "fsl,imx8qm-fec", - "fsl,s32v234-fec", - NULL - }; - - if (!strcmp(propname, "phy-reset-gpios") && - of_device_compatible_match(np, fec_devices)) { - bool active_high = of_property_read_bool(np, - "phy-reset-active-high"); - of_gpio_quirk_polarity(np, active_high, flags); - } - } } /** -- GitLab From b8b80348c57b360019071e17380298619c5d8066 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 29 Oct 2022 21:40:47 -0700 Subject: [PATCH 0374/1423] gpiolib: of: add polarity quirk for Freescale PCIe controller Bindings for Freescale PCIe controller use a separate property called "reset-gpio-active-high" to control polarity of its reset line, add it to the list of quirks in gpiolib so that gpiod API can be used in the driver. Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index e72b56a87b86c..be9c34cca3225 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -208,6 +208,15 @@ static void of_gpio_set_polarity_by_property(const struct device_node *np, { "fsl,imx8qm-fec", "phy-reset-gpios", "phy-reset-active-high" }, { "fsl,s32v234-fec", "phy-reset-gpios", "phy-reset-active-high" }, #endif +#if IS_ENABLED(CONFIG_PCI_IMX6) + { "fsl,imx6q-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx6sx-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx6qp-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx7d-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx8mq-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx8mm-pcie", "reset-gpio", "reset-gpio-active-high" }, + { "fsl,imx8mp-pcie", "reset-gpio", "reset-gpio-active-high" }, +#endif /* * The regulator GPIO handles are specified such that the -- GitLab From 503fa23614dc95f96af883a8e2e873d5c6cd53d8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 17 Sep 2022 13:03:09 +0100 Subject: [PATCH 0375/1423] PCI: Access Link 2 registers only for devices with Links PCIe r2.0, sec 7.8 added Link Capabilities/Status/Control 2 registers to the PCIe Capability with Capability Version 2. Previously we assumed these registers were implemented for all PCIe Capabilities of version 2 or greater, but in fact they are only implemented for devices with Links. Update pcie_capability_reg_implemented() to check whether the device has a Link. [bhelgaas: commit log, squash export] Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209100057070.2275@angie.orcam.me.uk Link: https://lore.kernel.org/r/alpine.DEB.2.21.2209100057300.2275@angie.orcam.me.uk Signed-off-by: Maciej W. Rozycki Signed-off-by: Bjorn Helgaas --- drivers/pci/access.c | 8 +++++++- drivers/pci/pci.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 708c7529647fd..3c230ca3de584 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -350,6 +350,11 @@ bool pcie_cap_has_lnkctl(const struct pci_dev *dev) type == PCI_EXP_TYPE_PCIE_BRIDGE; } +bool pcie_cap_has_lnkctl2(const struct pci_dev *dev) +{ + return pcie_cap_has_lnkctl(dev) && pcie_cap_version(dev) > 1; +} + static inline bool pcie_cap_has_sltctl(const struct pci_dev *dev) { return pcie_downstream_port(dev) && @@ -390,10 +395,11 @@ static bool pcie_capability_reg_implemented(struct pci_dev *dev, int pos) return pcie_cap_has_rtctl(dev); case PCI_EXP_DEVCAP2: case PCI_EXP_DEVCTL2: + return pcie_cap_version(dev) > 1; case PCI_EXP_LNKCAP2: case PCI_EXP_LNKCTL2: case PCI_EXP_LNKSTA2: - return pcie_cap_version(dev) > 1; + return pcie_cap_has_lnkctl2(dev); default: return false; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b1ebb7ab88051..9ed3b55500432 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -15,6 +15,7 @@ extern const unsigned char pcie_link_speed[]; extern bool pci_early_dump; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); +bool pcie_cap_has_lnkctl2(const struct pci_dev *dev); bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ -- GitLab From 19526717f768bf2f89ca01bd2a595728ebe57540 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Nov 2022 22:31:19 +0100 Subject: [PATCH 0376/1423] objtool: Optimize elf_dirty_reloc_sym() When moving a symbol in the symtab its index changes and any reloc referring that symtol-table-index will need to be rewritten too. In order to facilitate this, objtool simply marks the whole reloc section 'changed' which will cause the whole section to be re-generated. However, finding the relocs that use any given symbol is implemented rather crudely -- a fully iteration of all sections and their relocs. Given that some builds have over 20k sections (kallsyms etc..) iterating all that for *each* symbol moved takes a bit of time. Instead have each symbol keep a list of relocs that reference it. This *vastly* improves build times for certain configs. Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y2LlRA7x+8UsE1xf@hirez.programming.kicks-ass.net --- tools/objtool/elf.c | 27 ++++++++++----------------- tools/objtool/include/objtool/elf.h | 2 ++ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 3d636d12d679f..8cd7f018002c5 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -356,6 +356,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) struct rb_node *pnode; struct symbol *iter; + INIT_LIST_HEAD(&sym->reloc_list); INIT_LIST_HEAD(&sym->pv_target); sym->alias = sym; @@ -557,6 +558,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, reloc->sym = sym; reloc->addend = addend; + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); @@ -573,21 +575,10 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, */ static void elf_dirty_reloc_sym(struct elf *elf, struct symbol *sym) { - struct section *sec; - - list_for_each_entry(sec, &elf->sections, list) { - struct reloc *reloc; - - if (sec->changed) - continue; + struct reloc *reloc; - list_for_each_entry(reloc, &sec->reloc_list, list) { - if (reloc->sym == sym) { - sec->changed = true; - break; - } - } - } + list_for_each_entry(reloc, &sym->reloc_list, sym_reloc_entry) + reloc->sec->changed = true; } /* @@ -902,11 +893,12 @@ static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsi static int read_relocs(struct elf *elf) { + unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; struct section *sec; struct reloc *reloc; - int i; unsigned int symndx; - unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; + struct symbol *sym; + int i; if (!elf_alloc_hash(reloc, elf->text_size / 16)) return -1; @@ -947,13 +939,14 @@ static int read_relocs(struct elf *elf) reloc->sec = sec; reloc->idx = i; - reloc->sym = find_symbol_by_index(elf, symndx); + reloc->sym = sym = find_symbol_by_index(elf, symndx); if (!reloc->sym) { WARN("can't find reloc entry symbol %d for %s", symndx, sec->name); return -1; } + list_add_tail(&reloc->sym_reloc_entry, &sym->reloc_list); list_add_tail(&reloc->list, &sec->reloc_list); elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc)); diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index b6974e3173aa5..bca719b2104b8 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -62,6 +62,7 @@ struct symbol { u8 fentry : 1; u8 profiling_func : 1; struct list_head pv_target; + struct list_head reloc_list; }; struct reloc { @@ -73,6 +74,7 @@ struct reloc { }; struct section *sec; struct symbol *sym; + struct list_head sym_reloc_entry; unsigned long offset; unsigned int type; s64 addend; -- GitLab From 023f2340f053537cce170c31c430b0886c6f07ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Nov 2022 20:57:51 +0100 Subject: [PATCH 0377/1423] objtool: Fix weak hole vs prefix symbol Boris (and the robot) reported that objtool grew a new complaint about unreachable instructions. Upon inspection it was immediately clear the __weak zombie instructions struck again. For the unweary, the linker will simply remove the symbol for overriden __weak symbols but leave the instructions in place, creating unreachable instructions -- and objtool likes to report these. Commit 4adb23686795 ("objtool: Ignore extra-symbol code") was supposed to have dealt with that, but the new commit 9f2899fe36a6 ("objtool: Add option to generate prefix symbols") subtly broke that logic by created unvisited symbols. Fixes: 9f2899fe36a6 ("objtool: Add option to generate prefix symbols") Reported-by: Borislav Petkov Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 55066c4935702..4f1a7384426b4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4053,8 +4053,28 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func, offset = func->offset - prev->offset; if (offset >= opts.prefix) { - if (offset == opts.prefix) + if (offset == opts.prefix) { + /* + * Since the sec->symbol_list is ordered by + * offset (see elf_add_symbol()) the added + * symbol will not be seen by the iteration in + * validate_section(). + * + * Hence the lack of list_for_each_entry_safe() + * there. + * + * The direct concequence is that prefix symbols + * don't get visited (because pointless), except + * for the logic in ignore_unreachable_insn() + * that needs the terminating insn to be visited + * otherwise it will report the hole. + * + * Hence mark the first instruction of the + * prefix symbol as visisted. + */ + prev->visited |= VISITED_BRANCH; elf_create_prefix_symbol(file->elf, func, opts.prefix); + } break; } insn = prev; -- GitLab From b32fd8a60f5d855758208c2b5b49cba8087f03c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Nov 2022 21:17:03 +0100 Subject: [PATCH 0378/1423] x86,pm: Force out-of-line memcpy() GCC fancies inlining memcpy(), and because it cannot prove the destination is page-aligned (it is) it ends up generating atrocious code like: 19e: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1a5 1a1: R_X86_64_PC32 core_restore_code-0x4 1a5: 48 8d 78 08 lea 0x8(%rax),%rdi 1a9: 48 89 c1 mov %rax,%rcx 1ac: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1af: R_X86_64_32S core_restore_code 1b3: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi 1b7: 48 29 f9 sub %rdi,%rcx 1ba: 48 89 10 mov %rdx,(%rax) 1bd: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 1c4 1c0: R_X86_64_PC32 core_restore_code+0xff4 1c4: 48 29 ce sub %rcx,%rsi 1c7: 81 c1 00 10 00 00 add $0x1000,%ecx 1cd: 48 89 90 f8 0f 00 00 mov %rdx,0xff8(%rax) 1d4: c1 e9 03 shr $0x3,%ecx 1d7: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) Notably the alignment code generates a text reference to code_restore_code+0xff8, for which objtool raises the objection: vmlinux.o: warning: objtool: relocate_restore_code+0x3d: relocation to !ENDBR: next_arg+0x18 Applying some __assume_aligned(PAGE_SIZE) improve code-gen to: 19e: 48 89 c7 mov %rax,%rdi 1a1: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a4: R_X86_64_32S core_restore_code 1a8: b9 00 02 00 00 mov $0x200,%ecx 1ad: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) And resolve the problem, however, none of this is important code and a much simpler solution still is to force a memcpy() call: 1a1: ba 00 10 00 00 mov $0x1000,%edx 1a6: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 1a9: R_X86_64_32S core_restore_code 1ad: e8 00 00 00 00 call 1b2 1ae: R_X86_64_PLT32 __memcpy-0x4 Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index e94e0050a583a..6f955eb1e1631 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -159,7 +159,7 @@ int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + -- GitLab From 4fd5f70ce14da230c6a29648c3d51a48ee0b4bfd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 1 Nov 2022 10:25:07 -0700 Subject: [PATCH 0379/1423] x86/Kconfig: Enable kernel IBT by default The kernel IBT defense strongly mitigates the common "first step" of ROP attacks, by eliminating arbitrary stack pivots (that appear either at the end of a function or in immediate values), which cannot be reached if indirect calls must be to marked function entry addresses. IBT is also required to be enabled to gain the FineIBT feature when built with Kernel Control Flow Integrity. Additionally, given that this feature is runtime enabled via CPU ID, it clearly should be built in by default; it will only be enabled if the CPU supports it. The build takes 2 seconds longer, which seems a small price to pay for gaining this coverage by default. Suggested-by: Sami Tolvanen Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221101172503.gonna.094-kees@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 479ee63898f57..aaf1f0f461617 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1856,7 +1856,7 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" - bool + def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 -- GitLab From b1599915f09157e98f59556e1b2eafe473603347 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 6 Nov 2022 09:55:56 +0100 Subject: [PATCH 0380/1423] x86/cpufeatures: Move X86_FEATURE_CALL_DEPTH from bit 18 to bit 19 of word 11, to leave space for WIP X86_FEATURE_SGX_EDECCSSA bit Reallocate a soft-cpufeatures bit allocated for call-depth tracking code, which clashes with this recent KVM/SGX patch being worked on: KVM/VMX: Allow exposing EDECCSSA user leaf function to KVM guest Instead of reallocating cpufeatures bits in evil merges, make the allocation explicit. Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index aefd0816a333f..864c9b0dda687 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,7 +304,8 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_CALL_DEPTH (11*32+18) /* "" Call depth tracking for RSB stuffing */ + /* Hole left for X86_FEATURE_SGX_EDECCSSA */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ -- GitLab From abef378c434e6f5abd46fd536e9972374fb74e98 Mon Sep 17 00:00:00 2001 From: Arumugam Kolappan Date: Tue, 1 Nov 2022 00:27:44 -0700 Subject: [PATCH 0381/1423] RDMA/mlx5: Change debug log level for remote access error syndromes The mlx5 driver dumps the entire CQE buffer by default for few syndromes. Some syndromes are expected due to the application behavior [ex: MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch converts the log level from KERN_WARNING to KERN_DEBUG. This enables the application to get the CQE buffer dump by changing to KERN_DEBUG level as and when needed. Suggested-by: Leon Romanovsky Signed-off-by: Arumugam Kolappan Link: https://lore.kernel.org/r/1667287664-19377-1-git-send-email-aru.kolappan@oracle.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cq.c | 27 +++++++++++++++------------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index be189e0525de6..efc9e4a6df04a 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -267,17 +267,20 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } -static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe, + struct ib_wc *wc, const char *level) { - mlx5_ib_warn(dev, "dump error cqe\n"); - mlx5_dump_err_cqe(dev->mdev, cqe); + mlx5_ib_log(level, dev, "WC error: %d, Message: %s\n", wc->status, + ib_wc_status_msg(wc->status)); + print_hex_dump(level, "cqe_dump: ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), false); } static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe, struct ib_wc *wc) { - int dump = 1; + const char *dump = KERN_WARNING; switch (cqe->syndrome) { case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: @@ -287,10 +290,11 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, wc->status = IB_WC_LOC_QP_OP_ERR; break; case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_LOC_PROT_ERR; break; case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: - dump = 0; + dump = NULL; wc->status = IB_WC_WR_FLUSH_ERR; break; case MLX5_CQE_SYNDROME_MW_BIND_ERR: @@ -306,18 +310,20 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, wc->status = IB_WC_REM_INV_REQ_ERR; break; case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_REM_ACCESS_ERR; break; case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_REM_OP_ERR; break; case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + dump = NULL; wc->status = IB_WC_RETRY_EXC_ERR; - dump = 0; break; case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + dump = NULL; wc->status = IB_WC_RNR_RETRY_EXC_ERR; - dump = 0; break; case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: wc->status = IB_WC_REM_ABORT_ERR; @@ -328,11 +334,8 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, } wc->vendor_err = cqe->vendor_err_synd; - if (dump) { - mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status, - ib_wc_status_msg(wc->status)); - dump_cqe(dev, cqe); - } + if (dump) + dump_cqe(dev, cqe, wc, dump); } static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4a7f7064bd0eb..8b91babdd4c08 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -38,6 +38,10 @@ dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ __LINE__, current->pid, ##arg) +#define mlx5_ib_log(lvl, _dev, format, arg...) \ + dev_printk(lvl, &(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##arg) + #define MLX5_IB_DEFAULT_UIDX 0xffffff #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) -- GitLab From 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 Nov 2022 10:09:11 +1100 Subject: [PATCH 0382/1423] xfs: write page faults in iomap are not buffered writes When we reserve a delalloc region in xfs_buffered_write_iomap_begin, we mark the iomap as IOMAP_F_NEW so that the the write context understands that it allocated the delalloc region. If we then fail that buffered write, xfs_buffered_write_iomap_end() checks for the IOMAP_F_NEW flag and if it is set, it punches out the unused delalloc region that was allocated for the write. The assumption this code makes is that all buffered write operations that can allocate space are run under an exclusive lock (i_rwsem). This is an invalid assumption: page faults in mmap()d regions call through this same function pair to map the file range being faulted and this runs only holding the inode->i_mapping->invalidate_lock in shared mode. IOWs, we can have races between page faults and write() calls that fail the nested page cache write operation that result in data loss. That is, the failing iomap_end call will punch out the data that the other racing iomap iteration brought into the page cache. This can be reproduced with generic/34[46] if we arbitrarily fail page cache copy-in operations from write() syscalls. Code analysis tells us that the iomap_page_mkwrite() function holds the already instantiated and uptodate folio locked across the iomap mapping iterations. Hence the folio cannot be removed from memory whilst we are mapping the range it covers, and as such we do not care if the mapping changes state underneath the iomap iteration loop: 1. if the folio is not already dirty, there is no writeback races possible. 2. if we allocated the mapping (delalloc or unwritten), the folio cannot already be dirty. See #1. 3. If the folio is already dirty, it must be up to date. As we hold it locked, it cannot be reclaimed from memory. Hence we always have valid data in the page cache while iterating the mapping. 4. Valid data in the page cache can exist when the underlying mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not change the data in the page - it only affects actions if we are initialising a new page. Hence #3 applies and we don't care about these extent map transitions racing with iomap_page_mkwrite(). 5. iomap_page_mkwrite() checks for page invalidation races (truncate, hole punch, etc) after it locks the folio. We also hold the mapping->invalidation_lock here, and hence the mapping cannot change due to extent removal operations while we are iterating the folio. As such, filesystems that don't use bufferheads will never fail the iomap_folio_mkwrite_iter() operation on the current mapping, regardless of whether the iomap should be considered stale. Further, the range we are asked to iterate is limited to the range inside EOF that the folio spans. Hence, for XFS, we will only map the exact range we are asked for, and we will only do speculative preallocation with delalloc if we are mapping a hole at the EOF page. The iterator will consume the entire range of the folio that is within EOF, and anything beyond the EOF block cannot be accessed. We never need to truncate this post-EOF speculative prealloc away in the context of the iomap_page_mkwrite() iterator because if it remains unused we'll remove it when the last reference to the inode goes away. Hence we don't actually need an .iomap_end() cleanup/error handling path at all for iomap_page_mkwrite() for XFS. This means we can separate the page fault processing from the complexity of the .iomap_end() processing in the buffered write path. This also means that the buffered write path will also be able to take the mapping->invalidate_lock as necessary. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_iomap.c | 9 +++++++++ fs/xfs/xfs_iomap.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e462d39c840e6..595a5bcf46b94 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1325,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec12..5cea069a38b4e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c0..0f62ab633040c 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -47,6 +47,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; -- GitLab From 4eace75e0853273755b878ffa9cce6de84df975a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 4 Nov 2022 18:49:57 -0500 Subject: [PATCH 0383/1423] RDMA/irdma: Report the correct link speed The active link speed is currently hard-coded in irdma_query_port due to which the port rate in ibstatus does reflect the active link speed. Call ib_get_eth_speed in irdma_query_port to get the active link speed. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Reported-by: Kamal Heib Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221104234957.1135-1-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 35 +++-------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index a22afbb25bc58..434241789f123 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -63,36 +63,6 @@ static int irdma_query_device(struct ib_device *ibdev, return 0; } -/** - * irdma_get_eth_speed_and_width - Get IB port speed and width from netdev speed - * @link_speed: netdev phy link speed - * @active_speed: IB port speed - * @active_width: IB port width - */ -static void irdma_get_eth_speed_and_width(u32 link_speed, u16 *active_speed, - u8 *active_width) -{ - if (link_speed <= SPEED_1000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_SDR; - } else if (link_speed <= SPEED_10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (link_speed <= SPEED_20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (link_speed <= SPEED_25000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_EDR; - } else if (link_speed <= SPEED_40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - /** * irdma_query_port - get port attributes * @ibdev: device pointer from stack @@ -120,8 +90,9 @@ static int irdma_query_port(struct ib_device *ibdev, u32 port, props->state = IB_PORT_DOWN; props->phys_state = IB_PORT_PHYS_STATE_DISABLED; } - irdma_get_eth_speed_and_width(SPEED_100000, &props->active_speed, - &props->active_width); + + ib_get_eth_speed(ibdev, port, &props->active_speed, + &props->active_width); if (rdma_protocol_roce(ibdev, 1)) { props->gid_tbl_len = 32; -- GitLab From ece43fad220ba03c529cc0f6f302d796044e8476 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:43 +0800 Subject: [PATCH 0384/1423] RDMA/erdma: Extend access right field of FRMR and REG MR to support atomic To support atomic operations, IB_ACCESS_REMOTE_ATOMIC right should be passed to hardware for permission check. Since "access mode" field in FRMR SQE and RegMr command is never used by hw, we remove the "access mode" field, so that we can then have enough space to extend access fields. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-2-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 6 ++---- drivers/infiniband/hw/erdma/erdma_qp.c | 3 +-- drivers/infiniband/hw/erdma/erdma_verbs.c | 3 +-- drivers/infiniband/hw/erdma/erdma_verbs.h | 12 +++++++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index e788887732e1f..2a9a4c73d52cd 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -224,8 +224,7 @@ struct erdma_cmdq_create_cq_req { /* regmr cfg1 */ #define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) #define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6) -#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) -#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) +#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 1) /* regmr cfg2 */ #define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) @@ -370,8 +369,7 @@ struct erdma_rqe { #define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) /* REG MR attrs */ -#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) -#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) +#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 1) #define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) #define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 5fe1a339a4354..c7f343173cb90 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -397,8 +397,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); regmr_sge->length = cpu_to_le32(mr->ibmr.length); regmr_sge->stag = cpu_to_le32(reg_wr(send_wr)->key); - attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | - FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | + attrs = FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, mr->mem.mtt_nents); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 62be98e2b9414..f3bf87f17527f 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -118,8 +118,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | - FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | - FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access); req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, ilog2(mr->mem.page_size)) | FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index ab6380635e9e6..a5574f0252bb4 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -71,16 +71,18 @@ struct erdma_pd { #define ERDMA_MR_INLINE_MTT 0 #define ERDMA_MR_INDIRECT_MTT 1 -#define ERDMA_MR_ACC_LR BIT(0) -#define ERDMA_MR_ACC_LW BIT(1) -#define ERDMA_MR_ACC_RR BIT(2) -#define ERDMA_MR_ACC_RW BIT(3) +#define ERDMA_MR_ACC_RA BIT(0) +#define ERDMA_MR_ACC_LR BIT(1) +#define ERDMA_MR_ACC_LW BIT(2) +#define ERDMA_MR_ACC_RR BIT(3) +#define ERDMA_MR_ACC_RW BIT(4) static inline u8 to_erdma_access_flags(int access) { return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) | (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) | - (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0); + (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0) | + (access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0); } struct erdma_mem { -- GitLab From 71c6925f280ae8cb52eafee2404ae75c176c28ba Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:44 +0800 Subject: [PATCH 0385/1423] RDMA/erdma: Report atomic capacity when hardware supports atomic feature Introduce "capacity flags" field at where hardware put all zeros originally in "query device" response. Using this field, hardware can report atomic feature if supports. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-3-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_hw.h | 5 +++++ drivers/infiniband/hw/erdma/erdma_main.c | 1 + drivers/infiniband/hw/erdma/erdma_verbs.c | 4 ++++ 4 files changed, 11 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 730783fbc8949..bb23d897c7108 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -124,6 +124,7 @@ struct erdma_devattr { u32 fw_version; unsigned char peer_addr[ETH_ALEN]; + unsigned long cap_flags; int numa_node; enum erdma_cc_alg cc; diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 2a9a4c73d52cd..808e7ee56d931 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -303,6 +303,7 @@ struct erdma_cmdq_destroy_qp_req { /* cap qword 0 definition */ #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) +#define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) @@ -314,6 +315,10 @@ struct erdma_cmdq_destroy_qp_req { #define ERDMA_NQP_PER_QBLOCK 1024 +enum { + ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7, +}; + #define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) /* CQE hdr */ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 49778bb294ae4..e44b06fea5957 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -374,6 +374,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); dev->attrs.max_mr = dev->attrs.max_qp << 1; dev->attrs.max_cq = dev->attrs.max_qp << 1; + dev->attrs.cap_flags = ERDMA_GET_CAP(FLAGS, cap0); dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; dev->attrs.max_ord = ERDMA_MAX_ORD; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index f3bf87f17527f..d843ce1f35f3d 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -288,6 +288,10 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_mw = dev->attrs.max_mw; attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + + if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) + attr->atomic_cap = IB_ATOMIC_GLOB; + attr->fw_ver = dev->attrs.fw_version; if (dev->netdev) -- GitLab From 0ca9c2e2844aa285c3656a29d4803839cfa8bca9 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:45 +0800 Subject: [PATCH 0386/1423] RDMA/erdma: Implement atomic operations support Add atomic operations support in post_send and poll_cq implementation. Also, rename 'laddr' and 'lkey' in struct erdma_sge to 'addr' and 'key', because this structure is used for both local and remote SGEs. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-4-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cq.c | 2 ++ drivers/infiniband/hw/erdma/erdma_hw.h | 18 ++++++++++-- drivers/infiniband/hw/erdma/erdma_qp.c | 40 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index 58e0dc5c75d1d..cabd8678b3558 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -64,6 +64,8 @@ static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { [ERDMA_OP_REG_MR] = IB_WC_REG_MR, [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, + [ERDMA_OP_ATOMIC_CAS] = IB_WC_COMP_SWAP, + [ERDMA_OP_ATOMIC_FAD] = IB_WC_FETCH_ADD, }; static const struct { diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 808e7ee56d931..1b2e2b70678f5 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -344,9 +344,9 @@ struct erdma_cqe { }; struct erdma_sge { - __aligned_le64 laddr; + __aligned_le64 addr; __le32 length; - __le32 lkey; + __le32 key; }; /* Receive Queue Element */ @@ -413,6 +413,16 @@ struct erdma_readreq_sqe { __le32 rsvd; }; +struct erdma_atomic_sqe { + __le64 hdr; + __le64 rsvd; + __le64 fetchadd_swap_data; + __le64 cmp_data; + + struct erdma_sge remote; + struct erdma_sge sgl; +}; + struct erdma_reg_mr_sqe { __le64 hdr; __le64 addr; @@ -472,7 +482,9 @@ enum erdma_opcode { ERDMA_OP_REG_MR = 14, ERDMA_OP_LOCAL_INV = 15, ERDMA_OP_READ_WITH_INV = 16, - ERDMA_NUM_OPCODES = 17, + ERDMA_OP_ATOMIC_CAS = 17, + ERDMA_OP_ATOMIC_FAD = 18, + ERDMA_NUM_OPCODES = 19, ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 }; diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index c7f343173cb90..521e97258de77 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -285,15 +285,16 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; u32 idx = *pi & (qp->attrs.sq_size - 1); enum ib_wr_opcode op = send_wr->opcode; + struct erdma_atomic_sqe *atomic_sqe; struct erdma_readreq_sqe *read_sqe; struct erdma_reg_mr_sqe *regmr_sge; struct erdma_write_sqe *write_sqe; struct erdma_send_sqe *send_sqe; struct ib_rdma_wr *rdma_wr; - struct erdma_mr *mr; + struct erdma_sge *sge; __le32 *length_field; + struct erdma_mr *mr; u64 wqe_hdr, *entry; - struct ib_sge *sge; u32 attrs; int ret; @@ -360,9 +361,9 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, qp->attrs.sq_size, SQEBB_SHIFT); - sge->addr = rdma_wr->remote_addr; - sge->lkey = rdma_wr->rkey; - sge->length = send_wr->sg_list[0].length; + sge->addr = cpu_to_le64(rdma_wr->remote_addr); + sge->key = cpu_to_le32(rdma_wr->rkey); + sge->length = cpu_to_le32(send_wr->sg_list[0].length); wqe_size = sizeof(struct erdma_readreq_sqe) + send_wr->num_sge * sizeof(struct ib_sge); @@ -423,6 +424,35 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); wqe_size = sizeof(struct erdma_reg_mr_sqe); goto out; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + atomic_sqe = (struct erdma_atomic_sqe *)entry; + if (op == IB_WR_ATOMIC_CMP_AND_SWP) { + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_ATOMIC_CAS); + atomic_sqe->fetchadd_swap_data = + cpu_to_le64(atomic_wr(send_wr)->swap); + atomic_sqe->cmp_data = + cpu_to_le64(atomic_wr(send_wr)->compare_add); + } else { + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_ATOMIC_FAD); + atomic_sqe->fetchadd_swap_data = + cpu_to_le64(atomic_wr(send_wr)->compare_add); + } + + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT); + sge->addr = cpu_to_le64(atomic_wr(send_wr)->remote_addr); + sge->key = cpu_to_le32(atomic_wr(send_wr)->rkey); + sge++; + + sge->addr = cpu_to_le64(send_wr->sg_list[0].addr); + sge->key = cpu_to_le32(send_wr->sg_list[0].lkey); + sge->length = cpu_to_le32(send_wr->sg_list[0].length); + + wqe_size = sizeof(*atomic_sqe); + goto out; default: return -EOPNOTSUPP; } -- GitLab From c8a51f03503633ec4a3f390aaadc3e8959fa44de Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Nov 2022 22:28:17 +0200 Subject: [PATCH 0387/1423] gpio: Add Generic regmap GPIO conversion to the TODO list It's actually preferable to use Generic regmap GPIO over other simple approaches. Add a TODO item for that. Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/TODO | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index f87ff3fa8a531..76560744587a5 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -124,6 +124,13 @@ Work items: this with dry-coding and sending to maintainers to test +Generic regmap GPIO + +In the very similar way to Generic MMIO GPIO convert the users which can +take advantage of using regmap over direct IO accessors. Note, even in +MMIO case the regmap MMIO with gpio-regmap.c is preferable over gpio-mmio.c. + + GPIOLIB irqchip The GPIOLIB irqchip is a helper irqchip for "simple cases" that should -- GitLab From 58635d6615f1e5a870548ae8999870fdfcdecec0 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Mon, 7 Nov 2022 13:12:21 +0100 Subject: [PATCH 0388/1423] s390/mm: fix virtual-physical address confusion for swiotlb swiotlb passes virtual addresses to set_memory_encrypted() and set_memory_decrypted(), but uv_remove_shared() and uv_set_shared() expect physical addresses. This currently works, because virtual and physical addresses are the same. Add virt_to_phys() to resolve the virtual-physical confusion. Reported-by: Marc Hartmayer Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20221107121221.156274-2-nrb@linux.ibm.com Message-Id: <20221107121221.156274-2-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/mem_encrypt.h | 4 ++-- arch/s390/mm/init.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 08a8b96606d7e..b85e13505a0f2 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,8 +4,8 @@ #ifndef __ASSEMBLY__ -int set_memory_encrypted(unsigned long addr, int numpages); -int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 97d66a3e60fb2..d509656c67d77 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -140,25 +140,25 @@ void mark_rodata_ro(void) debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -- GitLab From 95e7fc84c78adeef654acb919f04d98a87e6372d Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 1 Nov 2022 16:24:42 +0800 Subject: [PATCH 0389/1423] dt-bindings: gpio: add entry for hisilicon,ascend910-gpio Add the new compatible for HiSilicon gpio controller driver. Signed-off-by: Weilong Chen Reviewed-by: Rob Herring Reviewed-by: Yicong Yang Signed-off-by: Bartosz Golaszewski --- .../gpio/hisilicon,ascend910-gpio.yaml | 56 +++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 57 insertions(+) create mode 100644 Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml diff --git a/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml b/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml new file mode 100644 index 0000000000000..735d97d645a09 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/hisilicon,ascend910-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: HiSilicon common GPIO controller + +maintainers: + - Jay Fang + +description: + The HiSilicon common GPIO controller can be used for many different + types of SoC such as Huawei Ascend AI series chips. + +properties: + compatible: + const: hisilicon,ascend910-gpio + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + gpio-controller: true + + "#gpio-cells": + const: 2 + + ngpios: + minimum: 1 + maximum: 32 + +required: + - compatible + - reg + - interrupts + - gpio-controller + - "#gpio-cells" + - ngpios + +additionalProperties: false + +examples: + - | + #include + + gpio@840d0000 { + compatible = "hisilicon,ascend910-gpio"; + reg = <0x840d0000 0x1000>; + ngpios = <32>; + gpio-controller; + #gpio-cells = <2>; + interrupts = ; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 74efa0492c430..02f333c1093eb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9198,6 +9198,7 @@ HISILICON GPIO DRIVER M: Jay Fang L: linux-gpio@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml F: drivers/gpio/gpio-hisi.c HISILICON HIGH PERFORMANCE RSA ENGINE DRIVER (HPRE) -- GitLab From 80280df758c1498485988b30cf6887fde7796056 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 1 Nov 2022 16:24:41 +0800 Subject: [PATCH 0390/1423] gpio: hisi: Add initial device tree support Add support for HiSilicon GPIO controller in embedded platform, which boot from devicetree. Signed-off-by: Weilong Chen Acked-by: Jay Fang Reviewed-by: Yicong Yang Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-hisi.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 8c756cb292140..4bfedb0109a76 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -319,7 +319,7 @@ config GPIO_GRGPIO config GPIO_HISI tristate "HiSilicon GPIO controller driver" - depends on (ARM64 && ACPI) || COMPILE_TEST + depends on ARM64 || COMPILE_TEST select GPIO_GENERIC select GPIOLIB_IRQCHIP help diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index 3caabef5c7a2e..55bd69043bf42 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -221,6 +221,12 @@ static const struct acpi_device_id hisi_gpio_acpi_match[] = { }; MODULE_DEVICE_TABLE(acpi, hisi_gpio_acpi_match); +static const struct of_device_id hisi_gpio_dts_match[] = { + { .compatible = "hisilicon,ascend910-gpio", }, + { } +}; +MODULE_DEVICE_TABLE(of, hisi_gpio_dts_match); + static void hisi_gpio_get_pdata(struct device *dev, struct hisi_gpio *hisi_gpio) { @@ -311,6 +317,7 @@ static struct platform_driver hisi_gpio_driver = { .driver = { .name = HISI_GPIO_DRIVER_NAME, .acpi_match_table = hisi_gpio_acpi_match, + .of_match_table = hisi_gpio_dts_match, }, .probe = hisi_gpio_probe, }; -- GitLab From 9f0b4cc174c3c5ceb9322d01372369610f327c42 Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Fri, 4 Nov 2022 11:24:30 +0800 Subject: [PATCH 0391/1423] PCI/ACPI: Use METHOD_NAME__UID instead of plain string Replace the string "_UID" with the METHOD_NAME__UID macro so instances are easier to find. Link: https://lore.kernel.org/r/20221104032430.186424-1-zouyipeng@huawei.com Signed-off-by: Yipeng Zou Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index a46fec776ad77..068d6745bf98c 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -67,7 +67,7 @@ static acpi_status acpi_match_rc(acpi_handle handle, u32 lvl, void *context, unsigned long long uid; acpi_status status; - status = acpi_evaluate_integer(handle, "_UID", NULL, &uid); + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, NULL, &uid); if (ACPI_FAILURE(status) || uid != *segment) return AE_CTRL_DEPTH; -- GitLab From 44e985938e85503d0a69ec538e15fd33c1a4df05 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 7 Nov 2022 15:31:08 -0600 Subject: [PATCH 0392/1423] Revert "PCI: Clear PCI_STATUS when setting up device" This reverts commit 6cd514e58f12b211d638dbf6f791fa18d854f09c. Christophe Fergeau reported that 6cd514e58f12 ("PCI: Clear PCI_STATUS when setting up device") causes boot failures when trying to start linux guests with Apple's virtualization framework (for example using https://developer.apple.com/documentation/virtualization/running_linux_in_a_virtual_machine?language=objc) 6cd514e58f12 only solved a cosmetic problem, so revert it to fix the boot failures. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2137803 Signed-off-by: Bjorn Helgaas --- drivers/pci/probe.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b66fa42c4b1fa..1d6f7b502020d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1891,9 +1891,6 @@ int pci_setup_device(struct pci_dev *dev) dev->broken_intx_masking = pci_intx_mask_broken(dev); - /* Clear errors left from system firmware */ - pci_write_config_word(dev, PCI_STATUS, 0xffff); - switch (dev->hdr_type) { /* header type */ case PCI_HEADER_TYPE_NORMAL: /* standard header */ if (class == PCI_CLASS_BRIDGE_PCI) -- GitLab From 61da03328a603d2d4a5b2e80cbe29bbf0122e6f8 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Mon, 7 Nov 2022 00:28:05 -0600 Subject: [PATCH 0393/1423] memblock tests: introduce range tests for memblock_alloc_exact_nid_raw Add TEST_F_EXACT flag, which specifies that tests should run memblock_alloc_exact_nid_raw(). Introduce range tests for memblock_alloc_exact_nid_raw() by using the TEST_F_EXACT flag to run the range tests in alloc_nid_api.c, since memblock_alloc_exact_nid_raw() and memblock_alloc_try_nid_raw() behave the same way when nid = NUMA_NO_NODE. Rename tests and other functions in alloc_nid_api.c by removing "_try". Since the test names will be displayed in verbose output, they need to be general enough to refer to any of the memblock functions that the tests may run. Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5a4b6d1b6130ab7375314e1c45a6d5813dfdabbd.1667802195.git.remckee0@gmail.com --- tools/testing/memblock/Makefile | 2 +- tools/testing/memblock/main.c | 2 + .../memblock/tests/alloc_exact_nid_api.c | 22 + .../memblock/tests/alloc_exact_nid_api.h | 9 + tools/testing/memblock/tests/alloc_nid_api.c | 546 +++++++++--------- tools/testing/memblock/tests/alloc_nid_api.h | 1 + tools/testing/memblock/tests/common.h | 2 + 7 files changed, 320 insertions(+), 264 deletions(-) create mode 100644 tools/testing/memblock/tests/alloc_exact_nid_api.c create mode 100644 tools/testing/memblock/tests/alloc_exact_nid_api.h diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index 246f7ac8489b4..2310ac4d080ec 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -7,7 +7,7 @@ CFLAGS += -I. -I../../include -Wall -O2 -fsanitize=address \ LDFLAGS += -fsanitize=address -fsanitize=undefined TARGETS = main TEST_OFILES = tests/alloc_nid_api.o tests/alloc_helpers_api.o tests/alloc_api.o \ - tests/basic_api.o tests/common.o + tests/basic_api.o tests/common.o tests/alloc_exact_nid_api.o DEP_OFILES = memblock.o lib/slab.o mmzone.o slab.o OFILES = main.o $(DEP_OFILES) $(TEST_OFILES) EXTR_SRC = ../../../mm/memblock.c diff --git a/tools/testing/memblock/main.c b/tools/testing/memblock/main.c index 4ca1024342b13..278f9dec50087 100644 --- a/tools/testing/memblock/main.c +++ b/tools/testing/memblock/main.c @@ -3,6 +3,7 @@ #include "tests/alloc_api.h" #include "tests/alloc_helpers_api.h" #include "tests/alloc_nid_api.h" +#include "tests/alloc_exact_nid_api.h" #include "tests/common.h" int main(int argc, char **argv) @@ -12,6 +13,7 @@ int main(int argc, char **argv) memblock_alloc_checks(); memblock_alloc_helpers_checks(); memblock_alloc_nid_checks(); + memblock_alloc_exact_nid_checks(); return 0; } diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.c b/tools/testing/memblock/tests/alloc_exact_nid_api.c new file mode 100644 index 0000000000000..6406496623a01 --- /dev/null +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "alloc_exact_nid_api.h" +#include "alloc_nid_api.h" + +#define FUNC_NAME "memblock_alloc_exact_nid_raw" + +int memblock_alloc_exact_nid_checks(void) +{ + prefix_reset(); + prefix_push(FUNC_NAME); + + reset_memblock_attributes(); + dummy_physical_memory_init(); + + memblock_alloc_exact_nid_range_checks(); + + dummy_physical_memory_cleanup(); + + prefix_pop(); + + return 0; +} diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.h b/tools/testing/memblock/tests/alloc_exact_nid_api.h new file mode 100644 index 0000000000000..4408719de3b9a --- /dev/null +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _MEMBLOCK_ALLOC_EXACT_NID_H +#define _MEMBLOCK_ALLOC_EXACT_NID_H + +#include "common.h" + +int memblock_alloc_exact_nid_checks(void); + +#endif diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 2c2d60f4e3e3c..49ef68cccd6f8 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -18,18 +18,29 @@ static const unsigned int node_fractions[] = { 625, /* 1/16 */ }; -static inline const char * const get_memblock_alloc_try_nid_name(int flags) +static inline const char * const get_memblock_alloc_nid_name(int flags) { + if (flags & TEST_F_EXACT) + return "memblock_alloc_exact_nid_raw"; if (flags & TEST_F_RAW) return "memblock_alloc_try_nid_raw"; return "memblock_alloc_try_nid"; } -static inline void *run_memblock_alloc_try_nid(phys_addr_t size, - phys_addr_t align, - phys_addr_t min_addr, - phys_addr_t max_addr, int nid) -{ +static inline void *run_memblock_alloc_nid(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid) +{ + assert(!(alloc_nid_test_flags & TEST_F_EXACT) || + (alloc_nid_test_flags & TEST_F_RAW)); + /* + * TEST_F_EXACT should be checked before TEST_F_RAW since + * memblock_alloc_exact_nid_raw() performs raw allocations. + */ + if (alloc_nid_test_flags & TEST_F_EXACT) + return memblock_alloc_exact_nid_raw(size, align, min_addr, + max_addr, nid); if (alloc_nid_test_flags & TEST_F_RAW) return memblock_alloc_try_nid_raw(size, align, min_addr, max_addr, nid); @@ -50,7 +61,7 @@ static inline void *run_memblock_alloc_try_nid(phys_addr_t size, * * Expect to allocate a region that ends at max_addr. */ -static int alloc_try_nid_top_down_simple_check(void) +static int alloc_nid_top_down_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -65,9 +76,9 @@ static int alloc_try_nid_top_down_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); @@ -102,7 +113,7 @@ static int alloc_try_nid_top_down_simple_check(void) * * Expect to allocate an aligned region that ends before max_addr. */ -static int alloc_try_nid_top_down_end_misaligned_check(void) +static int alloc_nid_top_down_end_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -118,9 +129,9 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512 + misalign; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); @@ -153,7 +164,7 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) * Expect to allocate a region that starts at min_addr and ends at * max_addr, given that min_addr is aligned. */ -static int alloc_try_nid_exact_address_generic_check(void) +static int alloc_nid_exact_address_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -168,9 +179,9 @@ static int alloc_try_nid_exact_address_generic_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); @@ -205,7 +216,7 @@ static int alloc_try_nid_exact_address_generic_check(void) * Expect to drop the lower limit and allocate a memory region which * ends at max_addr (if the address is aligned). */ -static int alloc_try_nid_top_down_narrow_range_check(void) +static int alloc_nid_top_down_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -219,9 +230,9 @@ static int alloc_try_nid_top_down_narrow_range_check(void) min_addr = memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -257,7 +268,7 @@ static int alloc_try_nid_top_down_narrow_range_check(void) * * Expect no allocation to happen. */ -static int alloc_try_nid_low_max_generic_check(void) +static int alloc_nid_low_max_generic_check(void) { void *allocated_ptr = NULL; phys_addr_t size = SZ_1K; @@ -270,9 +281,9 @@ static int alloc_try_nid_low_max_generic_check(void) min_addr = memblock_start_of_DRAM(); max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -295,7 +306,7 @@ static int alloc_try_nid_low_max_generic_check(void) * * Expect a merge of both regions. Only the region size gets updated. */ -static int alloc_try_nid_min_reserved_generic_check(void) +static int alloc_nid_min_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -315,9 +326,9 @@ static int alloc_try_nid_min_reserved_generic_check(void) memblock_reserve(reserved_base, r1_size); - allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); @@ -347,7 +358,7 @@ static int alloc_try_nid_min_reserved_generic_check(void) * * Expect a merge of regions. Only the region size gets updated. */ -static int alloc_try_nid_max_reserved_generic_check(void) +static int alloc_nid_max_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -365,9 +376,9 @@ static int alloc_try_nid_max_reserved_generic_check(void) memblock_reserve(max_addr, r1_size); - allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); @@ -400,7 +411,7 @@ static int alloc_try_nid_max_reserved_generic_check(void) * updated. The total size field gets updated. */ -static int alloc_try_nid_top_down_reserved_with_space_check(void) +static int alloc_nid_top_down_reserved_with_space_check(void) { struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; @@ -428,9 +439,9 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); @@ -465,7 +476,7 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) * Expect to merge all of the regions into one. The region counter and total * size fields get updated. */ -static int alloc_try_nid_reserved_full_merge_generic_check(void) +static int alloc_nid_reserved_full_merge_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -491,9 +502,9 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); @@ -527,7 +538,7 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) * Expect to merge the new region with r2. The second region does not get * updated. The total size counter gets updated. */ -static int alloc_try_nid_top_down_reserved_no_space_check(void) +static int alloc_nid_top_down_reserved_no_space_check(void) { struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; @@ -555,9 +566,9 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); @@ -596,7 +607,7 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) * Expect no allocation to happen. */ -static int alloc_try_nid_reserved_all_generic_check(void) +static int alloc_nid_reserved_all_generic_check(void) { void *allocated_ptr = NULL; struct region r1, r2; @@ -620,9 +631,9 @@ static int alloc_try_nid_reserved_all_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -636,7 +647,7 @@ static int alloc_try_nid_reserved_all_generic_check(void) * bigger than the end address of the available memory. Expect to allocate * a region that ends before the end of the memory. */ -static int alloc_try_nid_top_down_cap_max_check(void) +static int alloc_nid_top_down_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -650,9 +661,9 @@ static int alloc_try_nid_top_down_cap_max_check(void) min_addr = memblock_end_of_DRAM() - SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -673,7 +684,7 @@ static int alloc_try_nid_top_down_cap_max_check(void) * smaller than the start address of the available memory. Expect to allocate * a region that ends before the end of the memory. */ -static int alloc_try_nid_top_down_cap_min_check(void) +static int alloc_nid_top_down_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -687,9 +698,9 @@ static int alloc_try_nid_top_down_cap_min_check(void) min_addr = memblock_start_of_DRAM() - SZ_256; max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -719,7 +730,7 @@ static int alloc_try_nid_top_down_cap_min_check(void) * * Expect to allocate a region that ends before max_addr. */ -static int alloc_try_nid_bottom_up_simple_check(void) +static int alloc_nid_bottom_up_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -734,9 +745,9 @@ static int alloc_try_nid_bottom_up_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); @@ -771,7 +782,7 @@ static int alloc_try_nid_bottom_up_simple_check(void) * * Expect to allocate an aligned region that ends before max_addr. */ -static int alloc_try_nid_bottom_up_start_misaligned_check(void) +static int alloc_nid_bottom_up_start_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -787,9 +798,9 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) min_addr = memblock_start_of_DRAM() + misalign; max_addr = min_addr + SZ_512; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); @@ -824,7 +835,7 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) * Expect to drop the lower limit and allocate a memory region which * starts at the beginning of the available memory. */ -static int alloc_try_nid_bottom_up_narrow_range_check(void) +static int alloc_nid_bottom_up_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -838,9 +849,9 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) min_addr = memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -873,7 +884,7 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) * updated. The total size field gets updated. */ -static int alloc_try_nid_bottom_up_reserved_with_space_check(void) +static int alloc_nid_bottom_up_reserved_with_space_check(void) { struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; @@ -901,9 +912,9 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); @@ -942,7 +953,7 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) * Other regions are not modified. */ -static int alloc_try_nid_bottom_up_reserved_no_space_check(void) +static int alloc_nid_bottom_up_reserved_no_space_check(void) { struct memblock_region *rgn1 = &memblock.reserved.regions[2]; struct memblock_region *rgn2 = &memblock.reserved.regions[1]; @@ -971,9 +982,9 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); @@ -1000,7 +1011,7 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) * bigger than the end address of the available memory. Expect to allocate * a region that starts at the min_addr. */ -static int alloc_try_nid_bottom_up_cap_max_check(void) +static int alloc_nid_bottom_up_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -1014,9 +1025,9 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) min_addr = memblock_start_of_DRAM() + SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1037,7 +1048,7 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) * smaller than the start address of the available memory. Expect to allocate * a region at the beginning of the available memory. */ -static int alloc_try_nid_bottom_up_cap_min_check(void) +static int alloc_nid_bottom_up_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; @@ -1051,9 +1062,9 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM() - SZ_256; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1070,133 +1081,133 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) } /* Test case wrappers for range tests */ -static int alloc_try_nid_simple_check(void) +static int alloc_nid_simple_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_simple_check(); + alloc_nid_top_down_simple_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_simple_check(); + alloc_nid_bottom_up_simple_check(); return 0; } -static int alloc_try_nid_misaligned_check(void) +static int alloc_nid_misaligned_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_end_misaligned_check(); + alloc_nid_top_down_end_misaligned_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_start_misaligned_check(); + alloc_nid_bottom_up_start_misaligned_check(); return 0; } -static int alloc_try_nid_narrow_range_check(void) +static int alloc_nid_narrow_range_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_narrow_range_check(); + alloc_nid_top_down_narrow_range_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_narrow_range_check(); + alloc_nid_bottom_up_narrow_range_check(); return 0; } -static int alloc_try_nid_reserved_with_space_check(void) +static int alloc_nid_reserved_with_space_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_reserved_with_space_check(); + alloc_nid_top_down_reserved_with_space_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_reserved_with_space_check(); + alloc_nid_bottom_up_reserved_with_space_check(); return 0; } -static int alloc_try_nid_reserved_no_space_check(void) +static int alloc_nid_reserved_no_space_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_reserved_no_space_check(); + alloc_nid_top_down_reserved_no_space_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_reserved_no_space_check(); + alloc_nid_bottom_up_reserved_no_space_check(); return 0; } -static int alloc_try_nid_cap_max_check(void) +static int alloc_nid_cap_max_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_cap_max_check(); + alloc_nid_top_down_cap_max_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_cap_max_check(); + alloc_nid_bottom_up_cap_max_check(); return 0; } -static int alloc_try_nid_cap_min_check(void) +static int alloc_nid_cap_min_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_cap_min_check(); + alloc_nid_top_down_cap_min_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_cap_min_check(); + alloc_nid_bottom_up_cap_min_check(); return 0; } -static int alloc_try_nid_min_reserved_check(void) +static int alloc_nid_min_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_min_reserved_generic_check); - run_bottom_up(alloc_try_nid_min_reserved_generic_check); + run_top_down(alloc_nid_min_reserved_generic_check); + run_bottom_up(alloc_nid_min_reserved_generic_check); return 0; } -static int alloc_try_nid_max_reserved_check(void) +static int alloc_nid_max_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_max_reserved_generic_check); - run_bottom_up(alloc_try_nid_max_reserved_generic_check); + run_top_down(alloc_nid_max_reserved_generic_check); + run_bottom_up(alloc_nid_max_reserved_generic_check); return 0; } -static int alloc_try_nid_exact_address_check(void) +static int alloc_nid_exact_address_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_exact_address_generic_check); - run_bottom_up(alloc_try_nid_exact_address_generic_check); + run_top_down(alloc_nid_exact_address_generic_check); + run_bottom_up(alloc_nid_exact_address_generic_check); return 0; } -static int alloc_try_nid_reserved_full_merge_check(void) +static int alloc_nid_reserved_full_merge_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_reserved_full_merge_generic_check); - run_bottom_up(alloc_try_nid_reserved_full_merge_generic_check); + run_top_down(alloc_nid_reserved_full_merge_generic_check); + run_bottom_up(alloc_nid_reserved_full_merge_generic_check); return 0; } -static int alloc_try_nid_reserved_all_check(void) +static int alloc_nid_reserved_all_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_reserved_all_generic_check); - run_bottom_up(alloc_try_nid_reserved_all_generic_check); + run_top_down(alloc_nid_reserved_all_generic_check); + run_bottom_up(alloc_nid_reserved_all_generic_check); return 0; } -static int alloc_try_nid_low_max_check(void) +static int alloc_nid_low_max_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_low_max_generic_check); - run_bottom_up(alloc_try_nid_low_max_generic_check); + run_top_down(alloc_nid_low_max_generic_check); + run_bottom_up(alloc_nid_low_max_generic_check); return 0; } @@ -1204,22 +1215,22 @@ static int alloc_try_nid_low_max_check(void) static int memblock_alloc_nid_range_checks(void) { test_print("Running %s range tests...\n", - get_memblock_alloc_try_nid_name(alloc_nid_test_flags)); + get_memblock_alloc_nid_name(alloc_nid_test_flags)); - alloc_try_nid_simple_check(); - alloc_try_nid_misaligned_check(); - alloc_try_nid_narrow_range_check(); - alloc_try_nid_reserved_with_space_check(); - alloc_try_nid_reserved_no_space_check(); - alloc_try_nid_cap_max_check(); - alloc_try_nid_cap_min_check(); + alloc_nid_simple_check(); + alloc_nid_misaligned_check(); + alloc_nid_narrow_range_check(); + alloc_nid_reserved_with_space_check(); + alloc_nid_reserved_no_space_check(); + alloc_nid_cap_max_check(); + alloc_nid_cap_min_check(); - alloc_try_nid_min_reserved_check(); - alloc_try_nid_max_reserved_check(); - alloc_try_nid_exact_address_check(); - alloc_try_nid_reserved_full_merge_check(); - alloc_try_nid_reserved_all_check(); - alloc_try_nid_low_max_check(); + alloc_nid_min_reserved_check(); + alloc_nid_max_reserved_check(); + alloc_nid_exact_address_check(); + alloc_nid_reserved_full_merge_check(); + alloc_nid_reserved_all_check(); + alloc_nid_low_max_check(); return 0; } @@ -1229,7 +1240,7 @@ static int memblock_alloc_nid_range_checks(void) * has enough memory to allocate a region of the requested size. * Expect to allocate an aligned region at the end of the requested node. */ -static int alloc_try_nid_top_down_numa_simple_check(void) +static int alloc_nid_top_down_numa_simple_check(void) { int nid_req = 3; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1247,8 +1258,8 @@ static int alloc_try_nid_top_down_numa_simple_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1280,7 +1291,7 @@ static int alloc_try_nid_top_down_numa_simple_check(void) * Expect to allocate an aligned region at the end of the last node that has * enough memory (in this case, nid = 6) after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_top_down_numa_small_node_check(void) +static int alloc_nid_top_down_numa_small_node_check(void) { int nid_req = 1; int nid_exp = 6; @@ -1299,8 +1310,8 @@ static int alloc_try_nid_top_down_numa_small_node_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1333,7 +1344,7 @@ static int alloc_try_nid_top_down_numa_small_node_check(void) * large enough and has enough unreserved memory (in this case, nid = 6) after * falling back to NUMA_NO_NODE. The region count and total size get updated. */ -static int alloc_try_nid_top_down_numa_node_reserved_check(void) +static int alloc_nid_top_down_numa_node_reserved_check(void) { int nid_req = 2; int nid_exp = 6; @@ -1353,8 +1364,8 @@ static int alloc_try_nid_top_down_numa_node_reserved_check(void) max_addr = memblock_end_of_DRAM(); memblock_reserve(req_node->base, req_node->size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1386,7 +1397,7 @@ static int alloc_try_nid_top_down_numa_node_reserved_check(void) * Expect to allocate an aligned region at the end of the requested node. The * region count and total size get updated. */ -static int alloc_try_nid_top_down_numa_part_reserved_check(void) +static int alloc_nid_top_down_numa_part_reserved_check(void) { int nid_req = 4; struct memblock_region *new_rgn = &memblock.reserved.regions[1]; @@ -1408,8 +1419,8 @@ static int alloc_try_nid_top_down_numa_part_reserved_check(void) max_addr = memblock_end_of_DRAM(); memblock_reserve(r1.base, r1.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1444,7 +1455,7 @@ static int alloc_try_nid_top_down_numa_part_reserved_check(void) * nid = NUMA_NODES - 1) after falling back to NUMA_NO_NODE. The region count * and total size get updated. */ -static int alloc_try_nid_top_down_numa_part_reserved_fallback_check(void) +static int alloc_nid_top_down_numa_part_reserved_fallback_check(void) { int nid_req = 4; int nid_exp = NUMA_NODES - 1; @@ -1469,8 +1480,8 @@ static int alloc_try_nid_top_down_numa_part_reserved_fallback_check(void) max_addr = memblock_end_of_DRAM(); memblock_reserve(r1.base, r1.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1507,7 +1518,7 @@ static int alloc_try_nid_top_down_numa_part_reserved_fallback_check(void) * Expect to drop the lower limit and allocate a memory region that ends at * the end of the requested node. */ -static int alloc_try_nid_top_down_numa_split_range_low_check(void) +static int alloc_nid_top_down_numa_split_range_low_check(void) { int nid_req = 2; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1525,8 +1536,8 @@ static int alloc_try_nid_top_down_numa_split_range_low_check(void) min_addr = req_node_end - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1563,7 +1574,7 @@ static int alloc_try_nid_top_down_numa_split_range_low_check(void) * Expect to drop the lower limit and allocate a memory region that * ends at the end of the first node that overlaps with the range. */ -static int alloc_try_nid_top_down_numa_split_range_high_check(void) +static int alloc_nid_top_down_numa_split_range_high_check(void) { int nid_req = 3; int nid_exp = nid_req - 1; @@ -1582,8 +1593,8 @@ static int alloc_try_nid_top_down_numa_split_range_high_check(void) min_addr = exp_node_end - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1620,7 +1631,7 @@ static int alloc_try_nid_top_down_numa_split_range_high_check(void) * Expect to drop the lower limit and allocate a memory region that ends at * the end of the requested node. */ -static int alloc_try_nid_top_down_numa_no_overlap_split_check(void) +static int alloc_nid_top_down_numa_no_overlap_split_check(void) { int nid_req = 2; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1638,8 +1649,8 @@ static int alloc_try_nid_top_down_numa_no_overlap_split_check(void) min_addr = node2->base - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1677,7 +1688,7 @@ static int alloc_try_nid_top_down_numa_no_overlap_split_check(void) * Expect to allocate a memory region at the end of the final node in * the range after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_top_down_numa_no_overlap_low_check(void) +static int alloc_nid_top_down_numa_no_overlap_low_check(void) { int nid_req = 0; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1694,8 +1705,8 @@ static int alloc_try_nid_top_down_numa_no_overlap_low_check(void) min_addr = min_node->base; max_addr = region_end(max_node); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1733,7 +1744,7 @@ static int alloc_try_nid_top_down_numa_no_overlap_low_check(void) * Expect to allocate a memory region at the end of the final node in * the range after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_top_down_numa_no_overlap_high_check(void) +static int alloc_nid_top_down_numa_no_overlap_high_check(void) { int nid_req = 7; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1750,8 +1761,8 @@ static int alloc_try_nid_top_down_numa_no_overlap_high_check(void) min_addr = min_node->base; max_addr = region_end(max_node); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1773,7 +1784,7 @@ static int alloc_try_nid_top_down_numa_no_overlap_high_check(void) * has enough memory to allocate a region of the requested size. * Expect to allocate an aligned region at the beginning of the requested node. */ -static int alloc_try_nid_bottom_up_numa_simple_check(void) +static int alloc_nid_bottom_up_numa_simple_check(void) { int nid_req = 3; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1791,8 +1802,8 @@ static int alloc_try_nid_bottom_up_numa_simple_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1824,7 +1835,7 @@ static int alloc_try_nid_bottom_up_numa_simple_check(void) * Expect to allocate an aligned region at the beginning of the first node that * has enough memory (in this case, nid = 0) after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_bottom_up_numa_small_node_check(void) +static int alloc_nid_bottom_up_numa_small_node_check(void) { int nid_req = 1; int nid_exp = 0; @@ -1843,8 +1854,8 @@ static int alloc_try_nid_bottom_up_numa_small_node_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1878,7 +1889,7 @@ static int alloc_try_nid_bottom_up_numa_small_node_check(void) * after falling back to NUMA_NO_NODE. The region count and total size get * updated. */ -static int alloc_try_nid_bottom_up_numa_node_reserved_check(void) +static int alloc_nid_bottom_up_numa_node_reserved_check(void) { int nid_req = 2; int nid_exp = 0; @@ -1898,8 +1909,8 @@ static int alloc_try_nid_bottom_up_numa_node_reserved_check(void) max_addr = memblock_end_of_DRAM(); memblock_reserve(req_node->base, req_node->size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1931,7 +1942,7 @@ static int alloc_try_nid_bottom_up_numa_node_reserved_check(void) * Expect to allocate an aligned region in the requested node that merges with * the existing reserved region. The total size gets updated. */ -static int alloc_try_nid_bottom_up_numa_part_reserved_check(void) +static int alloc_nid_bottom_up_numa_part_reserved_check(void) { int nid_req = 4; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -1955,8 +1966,8 @@ static int alloc_try_nid_bottom_up_numa_part_reserved_check(void) total_size = size + r1.size; memblock_reserve(r1.base, r1.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -1991,7 +2002,7 @@ static int alloc_try_nid_bottom_up_numa_part_reserved_check(void) * nid = 0) after falling back to NUMA_NO_NODE. The region count and total size * get updated. */ -static int alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(void) +static int alloc_nid_bottom_up_numa_part_reserved_fallback_check(void) { int nid_req = 4; int nid_exp = 0; @@ -2016,8 +2027,8 @@ static int alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(void) max_addr = memblock_end_of_DRAM(); memblock_reserve(r1.base, r1.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2054,7 +2065,7 @@ static int alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(void) * Expect to drop the lower limit and allocate a memory region at the beginning * of the requested node. */ -static int alloc_try_nid_bottom_up_numa_split_range_low_check(void) +static int alloc_nid_bottom_up_numa_split_range_low_check(void) { int nid_req = 2; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -2072,8 +2083,8 @@ static int alloc_try_nid_bottom_up_numa_split_range_low_check(void) min_addr = req_node_end - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2110,7 +2121,7 @@ static int alloc_try_nid_bottom_up_numa_split_range_low_check(void) * Expect to drop the lower limit and allocate a memory region at the beginning * of the first node that has enough memory. */ -static int alloc_try_nid_bottom_up_numa_split_range_high_check(void) +static int alloc_nid_bottom_up_numa_split_range_high_check(void) { int nid_req = 3; int nid_exp = 0; @@ -2130,8 +2141,8 @@ static int alloc_try_nid_bottom_up_numa_split_range_high_check(void) min_addr = req_node->base - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2168,7 +2179,7 @@ static int alloc_try_nid_bottom_up_numa_split_range_high_check(void) * Expect to drop the lower limit and allocate a memory region that starts at * the beginning of the requested node. */ -static int alloc_try_nid_bottom_up_numa_no_overlap_split_check(void) +static int alloc_nid_bottom_up_numa_no_overlap_split_check(void) { int nid_req = 2; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -2186,8 +2197,8 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_split_check(void) min_addr = node2->base - SZ_256; max_addr = min_addr + size; - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2225,7 +2236,7 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_split_check(void) * Expect to allocate a memory region at the beginning of the first node * in the range after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_bottom_up_numa_no_overlap_low_check(void) +static int alloc_nid_bottom_up_numa_no_overlap_low_check(void) { int nid_req = 0; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -2242,8 +2253,8 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_low_check(void) min_addr = min_node->base; max_addr = region_end(max_node); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2281,7 +2292,7 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_low_check(void) * Expect to allocate a memory region at the beginning of the first node * in the range after falling back to NUMA_NO_NODE. */ -static int alloc_try_nid_bottom_up_numa_no_overlap_high_check(void) +static int alloc_nid_bottom_up_numa_no_overlap_high_check(void) { int nid_req = 7; struct memblock_region *new_rgn = &memblock.reserved.regions[0]; @@ -2298,8 +2309,8 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_high_check(void) min_addr = min_node->base; max_addr = region_end(max_node); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2330,7 +2341,7 @@ static int alloc_try_nid_bottom_up_numa_no_overlap_high_check(void) * * Expect no allocation to happen. */ -static int alloc_try_nid_numa_large_region_generic_check(void) +static int alloc_nid_numa_large_region_generic_check(void) { int nid_req = 3; void *allocated_ptr = NULL; @@ -2344,8 +2355,8 @@ static int alloc_try_nid_numa_large_region_generic_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM(); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_EQ(allocated_ptr, NULL); test_pass_pop(); @@ -2374,7 +2385,7 @@ static int alloc_try_nid_numa_large_region_generic_check(void) * Expect to merge all of the regions into one. The region counter and total * size fields get updated. */ -static int alloc_try_nid_numa_reserved_full_merge_generic_check(void) +static int alloc_nid_numa_reserved_full_merge_generic_check(void) { int nid_req = 6; int nid_next = nid_req + 1; @@ -2404,8 +2415,8 @@ static int alloc_try_nid_numa_reserved_full_merge_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, nid_req); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, nid_req); ASSERT_NE(allocated_ptr, NULL); assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); @@ -2448,7 +2459,7 @@ static int alloc_try_nid_numa_reserved_full_merge_generic_check(void) * * Expect no allocation to happen. */ -static int alloc_try_nid_numa_split_all_reserved_generic_check(void) +static int alloc_nid_numa_split_all_reserved_generic_check(void) { void *allocated_ptr = NULL; struct memblock_region *next_node = &memblock.memory.regions[7]; @@ -2472,9 +2483,9 @@ static int alloc_try_nid_numa_split_all_reserved_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -2484,139 +2495,139 @@ static int alloc_try_nid_numa_split_all_reserved_generic_check(void) } /* Test case wrappers for NUMA tests */ -static int alloc_try_nid_numa_simple_check(void) +static int alloc_nid_numa_simple_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_simple_check(); + alloc_nid_top_down_numa_simple_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_simple_check(); + alloc_nid_bottom_up_numa_simple_check(); return 0; } -static int alloc_try_nid_numa_small_node_check(void) +static int alloc_nid_numa_small_node_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_small_node_check(); + alloc_nid_top_down_numa_small_node_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_small_node_check(); + alloc_nid_bottom_up_numa_small_node_check(); return 0; } -static int alloc_try_nid_numa_node_reserved_check(void) +static int alloc_nid_numa_node_reserved_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_node_reserved_check(); + alloc_nid_top_down_numa_node_reserved_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_node_reserved_check(); + alloc_nid_bottom_up_numa_node_reserved_check(); return 0; } -static int alloc_try_nid_numa_part_reserved_check(void) +static int alloc_nid_numa_part_reserved_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_part_reserved_check(); + alloc_nid_top_down_numa_part_reserved_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_part_reserved_check(); + alloc_nid_bottom_up_numa_part_reserved_check(); return 0; } -static int alloc_try_nid_numa_part_reserved_fallback_check(void) +static int alloc_nid_numa_part_reserved_fallback_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_part_reserved_fallback_check(); + alloc_nid_top_down_numa_part_reserved_fallback_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_part_reserved_fallback_check(); + alloc_nid_bottom_up_numa_part_reserved_fallback_check(); return 0; } -static int alloc_try_nid_numa_split_range_low_check(void) +static int alloc_nid_numa_split_range_low_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_split_range_low_check(); + alloc_nid_top_down_numa_split_range_low_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_split_range_low_check(); + alloc_nid_bottom_up_numa_split_range_low_check(); return 0; } -static int alloc_try_nid_numa_split_range_high_check(void) +static int alloc_nid_numa_split_range_high_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_split_range_high_check(); + alloc_nid_top_down_numa_split_range_high_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_split_range_high_check(); + alloc_nid_bottom_up_numa_split_range_high_check(); return 0; } -static int alloc_try_nid_numa_no_overlap_split_check(void) +static int alloc_nid_numa_no_overlap_split_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_no_overlap_split_check(); + alloc_nid_top_down_numa_no_overlap_split_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_no_overlap_split_check(); + alloc_nid_bottom_up_numa_no_overlap_split_check(); return 0; } -static int alloc_try_nid_numa_no_overlap_low_check(void) +static int alloc_nid_numa_no_overlap_low_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_no_overlap_low_check(); + alloc_nid_top_down_numa_no_overlap_low_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_no_overlap_low_check(); + alloc_nid_bottom_up_numa_no_overlap_low_check(); return 0; } -static int alloc_try_nid_numa_no_overlap_high_check(void) +static int alloc_nid_numa_no_overlap_high_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); - alloc_try_nid_top_down_numa_no_overlap_high_check(); + alloc_nid_top_down_numa_no_overlap_high_check(); memblock_set_bottom_up(true); - alloc_try_nid_bottom_up_numa_no_overlap_high_check(); + alloc_nid_bottom_up_numa_no_overlap_high_check(); return 0; } -static int alloc_try_nid_numa_large_region_check(void) +static int alloc_nid_numa_large_region_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_numa_large_region_generic_check); - run_bottom_up(alloc_try_nid_numa_large_region_generic_check); + run_top_down(alloc_nid_numa_large_region_generic_check); + run_bottom_up(alloc_nid_numa_large_region_generic_check); return 0; } -static int alloc_try_nid_numa_reserved_full_merge_check(void) +static int alloc_nid_numa_reserved_full_merge_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_numa_reserved_full_merge_generic_check); - run_bottom_up(alloc_try_nid_numa_reserved_full_merge_generic_check); + run_top_down(alloc_nid_numa_reserved_full_merge_generic_check); + run_bottom_up(alloc_nid_numa_reserved_full_merge_generic_check); return 0; } -static int alloc_try_nid_numa_split_all_reserved_check(void) +static int alloc_nid_numa_split_all_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - run_top_down(alloc_try_nid_numa_split_all_reserved_generic_check); - run_bottom_up(alloc_try_nid_numa_split_all_reserved_generic_check); + run_top_down(alloc_nid_numa_split_all_reserved_generic_check); + run_bottom_up(alloc_nid_numa_split_all_reserved_generic_check); return 0; } @@ -2624,22 +2635,22 @@ static int alloc_try_nid_numa_split_all_reserved_check(void) int __memblock_alloc_nid_numa_checks(void) { test_print("Running %s NUMA tests...\n", - get_memblock_alloc_try_nid_name(alloc_nid_test_flags)); + get_memblock_alloc_nid_name(alloc_nid_test_flags)); - alloc_try_nid_numa_simple_check(); - alloc_try_nid_numa_small_node_check(); - alloc_try_nid_numa_node_reserved_check(); - alloc_try_nid_numa_part_reserved_check(); - alloc_try_nid_numa_part_reserved_fallback_check(); - alloc_try_nid_numa_split_range_low_check(); - alloc_try_nid_numa_split_range_high_check(); + alloc_nid_numa_simple_check(); + alloc_nid_numa_small_node_check(); + alloc_nid_numa_node_reserved_check(); + alloc_nid_numa_part_reserved_check(); + alloc_nid_numa_part_reserved_fallback_check(); + alloc_nid_numa_split_range_low_check(); + alloc_nid_numa_split_range_high_check(); - alloc_try_nid_numa_no_overlap_split_check(); - alloc_try_nid_numa_no_overlap_low_check(); - alloc_try_nid_numa_no_overlap_high_check(); - alloc_try_nid_numa_large_region_check(); - alloc_try_nid_numa_reserved_full_merge_check(); - alloc_try_nid_numa_split_all_reserved_check(); + alloc_nid_numa_no_overlap_split_check(); + alloc_nid_numa_no_overlap_low_check(); + alloc_nid_numa_no_overlap_high_check(); + alloc_nid_numa_large_region_check(); + alloc_nid_numa_reserved_full_merge_check(); + alloc_nid_numa_split_all_reserved_check(); return 0; } @@ -2649,7 +2660,7 @@ static int memblock_alloc_nid_checks_internal(int flags) alloc_nid_test_flags = flags; prefix_reset(); - prefix_push(get_memblock_alloc_try_nid_name(flags)); + prefix_push(get_memblock_alloc_nid_name(flags)); reset_memblock_attributes(); dummy_physical_memory_init(); @@ -2671,3 +2682,12 @@ int memblock_alloc_nid_checks(void) return 0; } + +int memblock_alloc_exact_nid_range_checks(void) +{ + alloc_nid_test_flags = (TEST_F_RAW | TEST_F_EXACT); + + memblock_alloc_nid_range_checks(); + + return 0; +} diff --git a/tools/testing/memblock/tests/alloc_nid_api.h b/tools/testing/memblock/tests/alloc_nid_api.h index 92d07d230e18f..2b8cabacacb82 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.h +++ b/tools/testing/memblock/tests/alloc_nid_api.h @@ -5,6 +5,7 @@ #include "common.h" int memblock_alloc_nid_checks(void); +int memblock_alloc_exact_nid_range_checks(void); int __memblock_alloc_nid_numa_checks(void); #ifdef CONFIG_NUMA diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index cc82b85151b61..4f23302ee6779 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -21,6 +21,8 @@ enum test_flags { TEST_F_NONE = 0x0, /* Perform raw allocations (no zeroing of memory). */ TEST_F_RAW = 0x1, + /* Perform allocations on the exact node specified. */ + TEST_F_EXACT = 0x2 }; /** -- GitLab From bfc05a4ce3650a1e5a47ccdaf8c87f814829b4a7 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Mon, 7 Nov 2022 00:28:06 -0600 Subject: [PATCH 0394/1423] memblock tests: add top-down NUMA tests for memblock_alloc_exact_nid_raw Add tests for memblock_alloc_exact_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, all of these tests set nid != NUMA_NO_NODE. These tests are run with a top-down allocation direction. The tested scenarios are: Range unrestricted: - region can be allocated in the specific node requested: + there are no previously reserved regions + the requested node is partially reserved but has enough space Range restricted: - region can be allocated in the specific node requested after dropping min_addr: + range partially overlaps with two different nodes, where the first node is the requested node + range partially overlaps with two different nodes, where the requested node ends before min_addr + range overlaps with multiple nodes along node boundaries, and the requested node ends before min_addr Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/2cc0883243d68ddc3faf833d2d9e86f48534c1d7.1667802195.git.remckee0@gmail.com --- .../memblock/tests/alloc_exact_nid_api.c | 344 ++++++++++++++++++ .../memblock/tests/alloc_exact_nid_api.h | 16 + 2 files changed, 360 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.c b/tools/testing/memblock/tests/alloc_exact_nid_api.c index 6406496623a01..79150784b3739 100644 --- a/tools/testing/memblock/tests/alloc_exact_nid_api.c +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.c @@ -4,6 +4,349 @@ #define FUNC_NAME "memblock_alloc_exact_nid_raw" +/* + * contains the fraction of MEM_SIZE contained in each node in basis point + * units (one hundredth of 1% or 1/10000) + */ +static const unsigned int node_fractions[] = { + 2500, /* 1/4 */ + 625, /* 1/16 */ + 1250, /* 1/8 */ + 1250, /* 1/8 */ + 625, /* 1/16 */ + 625, /* 1/16 */ + 2500, /* 1/4 */ + 625, /* 1/16 */ +}; + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * has enough memory to allocate a region of the requested size. + * Expect to allocate an aligned region at the end of the requested node. + */ +static int alloc_exact_nid_top_down_numa_simple_check(void) +{ + int nid_req = 3; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved but has enough memory for the allocated region: + * + * | +---------------------------------------+ | + * | | requested | | + * +-----------+---------------------------------------+----------+ + * + * | +------------------+ +-----+ | + * | | reserved | | new | | + * +-----------+------------------+--------------+-----+----------+ + * + * Expect to allocate an aligned region at the end of the requested node. The + * region count and total size get updated. + */ +static int alloc_exact_nid_top_down_numa_part_reserved_check(void) +{ + int nid_req = 4; + struct memblock_region *new_rgn = &memblock.reserved.regions[1]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_8, req_node->size); + r1.base = req_node->base; + r1.size = req_node->size / SZ_2; + size = r1.size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(r1.base, r1.size); + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, size + r1.size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the first + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------------------+-----------+ | + * | | requested | node3 | | + * +-----------+-----------------------+-----------+--------------+ + * + + + * | +-----------+ | + * | | rgn | | + * +-----------------------+-----------+--------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that ends at + * the end of the requested node. + */ +static int alloc_exact_nid_top_down_numa_split_range_low_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t req_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + req_node_end = region_end(req_node); + min_addr = req_node_end - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node_end - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the requested + * node ends before min_addr: + * + * min_addr + * | max_addr + * | | + * v v + * | +---------------+ +-------------+---------+ | + * | | requested | | node1 | node2 | | + * +----+---------------+--------+-------------+---------+----------+ + * + + + * | +---------+ | + * | | rgn | | + * +----------+---------+-------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that ends at + * the end of the requested node. + */ +static int alloc_exact_nid_top_down_numa_no_overlap_split_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *node2 = &memblock.memory.regions[6]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_512; + min_addr = node2->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + ASSERT_LE(req_node->base, new_rgn->base); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_add range when + * the requested node and the range do not overlap, and requested node ends + * before min_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * |-----------+ +----------+----...----+----------+ | + * | requested | | min node | ... | max node | | + * +-----------+-----------+----------+----...----+----------+------+ + * + + + * | +-----+ | + * | | rgn | | + * +-----+-----+----------------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that ends at + * the end of the requested node. + */ +static int alloc_exact_nid_top_down_numa_no_overlap_low_check(void) +{ + int nid_req = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, region_end(req_node) - size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* Test case wrappers for NUMA tests */ +static int alloc_exact_nid_numa_simple_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_exact_nid_top_down_numa_simple_check(); + + return 0; +} + +static int alloc_exact_nid_numa_part_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_exact_nid_top_down_numa_part_reserved_check(); + + return 0; +} + +static int alloc_exact_nid_numa_split_range_low_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_exact_nid_top_down_numa_split_range_low_check(); + + return 0; +} + +static int alloc_exact_nid_numa_no_overlap_split_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_exact_nid_top_down_numa_no_overlap_split_check(); + + return 0; +} + +static int alloc_exact_nid_numa_no_overlap_low_check(void) +{ + test_print("\tRunning %s...\n", __func__); + memblock_set_bottom_up(false); + alloc_exact_nid_top_down_numa_no_overlap_low_check(); + + return 0; +} + +int __memblock_alloc_exact_nid_numa_checks(void) +{ + test_print("Running %s NUMA tests...\n", FUNC_NAME); + + alloc_exact_nid_numa_simple_check(); + alloc_exact_nid_numa_part_reserved_check(); + alloc_exact_nid_numa_split_range_low_check(); + alloc_exact_nid_numa_no_overlap_split_check(); + alloc_exact_nid_numa_no_overlap_low_check(); + + return 0; +} + int memblock_alloc_exact_nid_checks(void) { prefix_reset(); @@ -13,6 +356,7 @@ int memblock_alloc_exact_nid_checks(void) dummy_physical_memory_init(); memblock_alloc_exact_nid_range_checks(); + memblock_alloc_exact_nid_numa_checks(); dummy_physical_memory_cleanup(); diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.h b/tools/testing/memblock/tests/alloc_exact_nid_api.h index 4408719de3b9a..cef419d55d2ae 100644 --- a/tools/testing/memblock/tests/alloc_exact_nid_api.h +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.h @@ -5,5 +5,21 @@ #include "common.h" int memblock_alloc_exact_nid_checks(void); +int __memblock_alloc_exact_nid_numa_checks(void); + +#ifdef CONFIG_NUMA +static inline int memblock_alloc_exact_nid_numa_checks(void) +{ + __memblock_alloc_exact_nid_numa_checks(); + return 0; +} + +#else +static inline int memblock_alloc_exact_nid_numa_checks(void) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ #endif -- GitLab From b6df23edb1ba65b0b46788a872ddc85dfe86ccf5 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Mon, 7 Nov 2022 00:28:07 -0600 Subject: [PATCH 0395/1423] memblock tests: add bottom-up NUMA tests for memblock_alloc_exact_nid_raw Add tests for memblock_alloc_exact_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, all of these tests set nid != NUMA_NO_NODE. These tests are run with a bottom-up allocation direction. The tested scenarios are: Range unrestricted: - region can be allocated in the specific node requested: + there are no previously reserved regions + the requested node is partially reserved but has enough space Range restricted: - region can be allocated in the specific node requested after dropping min_addr: + range partially overlaps with two different nodes, where the first node is the requested node + range partially overlaps with two different nodes, where the requested node ends before min_addr + range overlaps with multiple nodes along node boundaries, and the requested node ends before min_addr Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/935f0eed5e06fd44dc67d9f49b277923d7896bd3.1667802195.git.remckee0@gmail.com --- .../memblock/tests/alloc_exact_nid_api.c | 282 ++++++++++++++++++ 1 file changed, 282 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.c b/tools/testing/memblock/tests/alloc_exact_nid_api.c index 79150784b3739..b97b5c04de053 100644 --- a/tools/testing/memblock/tests/alloc_exact_nid_api.c +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.c @@ -288,12 +288,286 @@ static int alloc_exact_nid_top_down_numa_no_overlap_low_check(void) return 0; } +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * has enough memory to allocate a region of the requested size. + * Expect to allocate an aligned region at the beginning of the requested node. + */ +static int alloc_exact_nid_bottom_up_numa_simple_check(void) +{ + int nid_req = 3; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved but has enough memory for the allocated region: + * + * | +---------------------------------------+ | + * | | requested | | + * +-----------+---------------------------------------+---------+ + * + * | +------------------+-----+ | + * | | reserved | new | | + * +-----------+------------------+-----+------------------------+ + * + * Expect to allocate an aligned region in the requested node that merges with + * the existing reserved region. The total size gets updated. + */ +static int alloc_exact_nid_bottom_up_numa_part_reserved_check(void) +{ + int nid_req = 4; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t total_size; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_8, req_node->size); + r1.base = req_node->base; + r1.size = req_node->size / SZ_2; + size = r1.size / SZ_4; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + total_size = size + r1.size; + + memblock_reserve(r1.base, r1.size); + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, total_size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the first + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------------------+-----------+ | + * | | requested | node3 | | + * +-----------+-----------------------+-----------+--------------+ + * + + + * | +-----------+ | + * | | rgn | | + * +-----------+-----------+--------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region at the beginning + * of the requested node. + */ +static int alloc_exact_nid_bottom_up_numa_split_range_low_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + phys_addr_t req_node_end; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + req_node_end = region_end(req_node); + min_addr = req_node_end - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), req_node_end); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the requested + * node ends before min_addr: + * + * min_addr + * | max_addr + * | | + * v v + * | +---------------+ +-------------+---------+ | + * | | requested | | node1 | node2 | | + * +----+---------------+--------+-------------+---------+---------+ + * + + + * | +---------+ | + * | | rgn | | + * +----+---------+------------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that starts at + * the beginning of the requested node. + */ +static int alloc_exact_nid_bottom_up_numa_no_overlap_split_check(void) +{ + int nid_req = 2; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *node2 = &memblock.memory.regions[6]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_512; + min_addr = node2->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_add range when + * the requested node and the range do not overlap, and requested node ends + * before min_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * |-----------+ +----------+----...----+----------+ | + * | requested | | min node | ... | max node | | + * +-----------+-----------+----------+----...----+----------+------+ + * + + + * |-----+ | + * | rgn | | + * +-----+----------------------------------------------------------+ + * + * Expect to drop the lower limit and allocate a memory region that starts at + * the beginning of the requested node. + */ +static int alloc_exact_nid_bottom_up_numa_no_overlap_low_check(void) +{ + int nid_req = 0; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, size); + ASSERT_EQ(new_rgn->base, req_node->base); + ASSERT_LE(region_end(new_rgn), region_end(req_node)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); + + return 0; +} + /* Test case wrappers for NUMA tests */ static int alloc_exact_nid_numa_simple_check(void) { test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_exact_nid_top_down_numa_simple_check(); + memblock_set_bottom_up(true); + alloc_exact_nid_bottom_up_numa_simple_check(); return 0; } @@ -303,6 +577,8 @@ static int alloc_exact_nid_numa_part_reserved_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_exact_nid_top_down_numa_part_reserved_check(); + memblock_set_bottom_up(true); + alloc_exact_nid_bottom_up_numa_part_reserved_check(); return 0; } @@ -312,6 +588,8 @@ static int alloc_exact_nid_numa_split_range_low_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_exact_nid_top_down_numa_split_range_low_check(); + memblock_set_bottom_up(true); + alloc_exact_nid_bottom_up_numa_split_range_low_check(); return 0; } @@ -321,6 +599,8 @@ static int alloc_exact_nid_numa_no_overlap_split_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_exact_nid_top_down_numa_no_overlap_split_check(); + memblock_set_bottom_up(true); + alloc_exact_nid_bottom_up_numa_no_overlap_split_check(); return 0; } @@ -330,6 +610,8 @@ static int alloc_exact_nid_numa_no_overlap_low_check(void) test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_exact_nid_top_down_numa_no_overlap_low_check(); + memblock_set_bottom_up(true); + alloc_exact_nid_bottom_up_numa_no_overlap_low_check(); return 0; } -- GitLab From 62bdc99008b372a6c5f81e6d968f3b077a1e3667 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Mon, 7 Nov 2022 00:28:08 -0600 Subject: [PATCH 0396/1423] memblock tests: add generic NUMA tests for memblock_alloc_exact_nid_raw Add tests for memblock_alloc_exact_nid_raw() where the simulated physical memory is set up with multiple NUMA nodes. Additionally, all but one of these tests set nid != NUMA_NO_NODE. All tests are run for both top-down and bottom-up allocation directions. The tested scenarios are: Range unrestricted: - region cannot be allocated: + there are no previously reserved regions, but requested node is too small + the requested node is fully reserved + the requested node is partially reserved and does not have enough space + none of the nodes have enough memory to allocate the region Range restricted: - region can be allocated in the specific node requested without dropping min_addr: + the range fully overlaps with the node, and there are adjacent reserved regions - region cannot be allocated: + range partially overlaps with two different nodes, where the second node is the requested node + range overlaps with multiple nodes along node boundaries, and the requested node starts after max_addr + nid is set to NUMA_NO_NODE and the total range can fit the region, but the range is split between two nodes and everything else is reserved Acked-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/51b14da46e6591428df3aefc5acc7dca9341a541.1667802195.git.remckee0@gmail.com --- .../memblock/tests/alloc_exact_nid_api.c | 465 ++++++++++++++++++ 1 file changed, 465 insertions(+) diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.c b/tools/testing/memblock/tests/alloc_exact_nid_api.c index b97b5c04de053..6e14447da6e1c 100644 --- a/tools/testing/memblock/tests/alloc_exact_nid_api.c +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.c @@ -560,6 +560,390 @@ static int alloc_exact_nid_bottom_up_numa_no_overlap_low_check(void) return 0; } +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * does not have enough memory to allocate a region of the requested size: + * + * | +-----+ | + * | | req | | + * +---+-----+----------------------------+ + * + * +---------+ + * | rgn | + * +---------+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_small_node_generic_check(void) +{ + int nid_req = 1; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = SZ_2 * req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is fully reserved: + * + * | +---------+ | + * | |requested| | + * +--------------+---------+-------------+ + * + * | +---------+ | + * | | reserved| | + * +--------------+---------+-------------+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_node_reserved_generic_check(void) +{ + int nid_req = 2; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + size = req_node->size; + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(req_node->base, req_node->size); + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * is partially reserved and does not have enough contiguous memory for the + * allocated region: + * + * | +-----------------------+ | + * | | requested | | + * +-----------+-----------------------+----+ + * + * | +----------+ | + * | | reserved | | + * +-----------------+----------+-----------+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_part_reserved_fail_generic_check(void) +{ + int nid_req = 4; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + struct region r1; + phys_addr_t size; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + ASSERT_LE(SZ_4, req_node->size); + size = req_node->size / SZ_2; + r1.base = req_node->base + (size / SZ_2); + r1.size = size; + + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + memblock_reserve(r1.base, r1.size); + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region that spans over the min_addr + * and max_addr range and overlaps with two different nodes, where the second + * node is the requested node: + * + * min_addr + * | max_addr + * | | + * v v + * | +--------------------------+---------+ | + * | | first node |requested| | + * +------+--------------------------+---------+----------------+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_split_range_high_generic_check(void) +{ + int nid_req = 3; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_512; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = req_node->base - SZ_256; + max_addr = min_addr + size; + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_add range when + * the requested node and the range do not overlap, and requested node starts + * after max_addr. The range overlaps with multiple nodes along node + * boundaries: + * + * min_addr + * | max_addr + * | | + * v v + * | +----------+----...----+----------+ +-----------+ | + * | | min node | ... | max node | | requested | | + * +-----+----------+----...----+----------+--------+-----------+---+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_no_overlap_high_generic_check(void) +{ + int nid_req = 7; + struct memblock_region *min_node = &memblock.memory.regions[2]; + struct memblock_region *max_node = &memblock.memory.regions[5]; + void *allocated_ptr = NULL; + phys_addr_t size = SZ_64; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = min_node->base; + max_addr = region_end(max_node); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate a memory region in a specific NUMA node that + * does not have enough memory to allocate a region of the requested size. + * Additionally, none of the nodes have enough memory to allocate the region: + * + * +-----------------------------------+ + * | new | + * +-----------------------------------+ + * |-------+-------+-------+-------+-------+-------+-------+-------| + * | node0 | node1 | node2 | node3 | node4 | node5 | node6 | node7 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_large_region_generic_check(void) +{ + int nid_req = 3; + void *allocated_ptr = NULL; + phys_addr_t size = MEM_SIZE / SZ_2; + phys_addr_t min_addr; + phys_addr_t max_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + min_addr = memblock_start_of_DRAM(); + max_addr = memblock_end_of_DRAM(); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_addr range when + * there are two reserved regions at the borders. The requested node starts at + * min_addr and ends at max_addr and is the same size as the region to be + * allocated: + * + * min_addr + * | max_addr + * | | + * v v + * | +-----------+-----------------------+-----------------------| + * | | node5 | requested | node7 | + * +------+-----------+-----------------------+-----------------------+ + * + + + * | +----+-----------------------+----+ | + * | | r2 | new | r1 | | + * +-------------+----+-----------------------+----+------------------+ + * + * Expect to merge all of the regions into one. The region counter and total + * size fields get updated. + */ +static int alloc_exact_nid_numa_reserved_full_merge_generic_check(void) +{ + int nid_req = 6; + int nid_next = nid_req + 1; + struct memblock_region *new_rgn = &memblock.reserved.regions[0]; + struct memblock_region *req_node = &memblock.memory.regions[nid_req]; + struct memblock_region *next_node = &memblock.memory.regions[nid_next]; + void *allocated_ptr = NULL; + struct region r1, r2; + phys_addr_t size = req_node->size; + phys_addr_t total_size; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + r1.base = next_node->base; + r1.size = SZ_128; + + r2.size = SZ_128; + r2.base = r1.base - (size + r2.size); + + total_size = r1.size + r2.size + size; + min_addr = r2.base + r2.size; + max_addr = r1.base; + + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + nid_req); + + ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_NE(allocated_ptr, 0, size); + + ASSERT_EQ(new_rgn->size, total_size); + ASSERT_EQ(new_rgn->base, r2.base); + + ASSERT_LE(new_rgn->base, req_node->base); + ASSERT_LE(region_end(req_node), region_end(new_rgn)); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to allocate memory within min_addr and max_add range, + * where the total range can fit the region, but it is split between two nodes + * and everything else is reserved. Additionally, nid is set to NUMA_NO_NODE + * instead of requesting a specific node: + * + * +-----------+ + * | new | + * +-----------+ + * | +---------------------+-----------| + * | | prev node | next node | + * +------+---------------------+-----------+ + * + + + * |----------------------+ +-----| + * | r1 | | r2 | + * +----------------------+-----------+-----+ + * ^ ^ + * | | + * | max_addr + * | + * min_addr + * + * Expect no allocation to happen. + */ +static int alloc_exact_nid_numa_split_all_reserved_generic_check(void) +{ + void *allocated_ptr = NULL; + struct memblock_region *next_node = &memblock.memory.regions[7]; + struct region r1, r2; + phys_addr_t size = SZ_256; + phys_addr_t max_addr; + phys_addr_t min_addr; + + PREFIX_PUSH(); + setup_numa_memblock(node_fractions); + + r2.base = next_node->base + SZ_128; + r2.size = memblock_end_of_DRAM() - r2.base; + + r1.size = MEM_SIZE - (r2.size + size); + r1.base = memblock_start_of_DRAM(); + + min_addr = r1.base + r1.size; + max_addr = r2.base; + + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + + allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); + + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); + + return 0; +} + /* Test case wrappers for NUMA tests */ static int alloc_exact_nid_numa_simple_check(void) { @@ -616,6 +1000,78 @@ static int alloc_exact_nid_numa_no_overlap_low_check(void) return 0; } +static int alloc_exact_nid_numa_small_node_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_small_node_generic_check); + run_bottom_up(alloc_exact_nid_numa_small_node_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_node_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_node_reserved_generic_check); + run_bottom_up(alloc_exact_nid_numa_node_reserved_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_part_reserved_fail_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_part_reserved_fail_generic_check); + run_bottom_up(alloc_exact_nid_numa_part_reserved_fail_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_split_range_high_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_split_range_high_generic_check); + run_bottom_up(alloc_exact_nid_numa_split_range_high_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_no_overlap_high_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_no_overlap_high_generic_check); + run_bottom_up(alloc_exact_nid_numa_no_overlap_high_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_large_region_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_large_region_generic_check); + run_bottom_up(alloc_exact_nid_numa_large_region_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_reserved_full_merge_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_reserved_full_merge_generic_check); + run_bottom_up(alloc_exact_nid_numa_reserved_full_merge_generic_check); + + return 0; +} + +static int alloc_exact_nid_numa_split_all_reserved_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_exact_nid_numa_split_all_reserved_generic_check); + run_bottom_up(alloc_exact_nid_numa_split_all_reserved_generic_check); + + return 0; +} + int __memblock_alloc_exact_nid_numa_checks(void) { test_print("Running %s NUMA tests...\n", FUNC_NAME); @@ -626,6 +1082,15 @@ int __memblock_alloc_exact_nid_numa_checks(void) alloc_exact_nid_numa_no_overlap_split_check(); alloc_exact_nid_numa_no_overlap_low_check(); + alloc_exact_nid_numa_small_node_check(); + alloc_exact_nid_numa_node_reserved_check(); + alloc_exact_nid_numa_part_reserved_fail_check(); + alloc_exact_nid_numa_split_range_high_check(); + alloc_exact_nid_numa_no_overlap_high_check(); + alloc_exact_nid_numa_large_region_check(); + alloc_exact_nid_numa_reserved_full_merge_check(); + alloc_exact_nid_numa_split_all_reserved_check(); + return 0; } -- GitLab From 80c2fe022ef5d29f3bafee90c37dbcff18cab57a Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Mon, 7 Nov 2022 00:28:09 -0600 Subject: [PATCH 0397/1423] memblock tests: remove completed TODO item Remove completed item from TODO list. Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/f2263abe45613b28f1583fbf04a4bffcf735bcf6.1667802195.git.remckee0@gmail.com --- tools/testing/memblock/TODO | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index 503cc96fcdc3a..e306c90c535fb 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -1,10 +1,5 @@ TODO ===== -1. Add test cases using this functions (implement them for both directions): - + memblock_alloc_raw() - + memblock_alloc_exact_nid_raw() - + memblock_alloc_try_nid_raw() - -2. Add tests for memblock_alloc_node() to check if the correct NUMA node is set +1. Add tests for memblock_alloc_node() to check if the correct NUMA node is set for the new region -- GitLab From c14f7ccc9f5dcf9d06ddeec706f85405b2c80600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 14 Jul 2022 20:41:30 +0200 Subject: [PATCH 0398/1423] PCI: Assign PCI domain IDs by ida_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace assignment of PCI domain IDs from atomic_inc_return() to ida_alloc(). Use two IDAs, one for static domain allocations (those which are defined in device tree) and second for dynamic allocations (all other). During removal of root bus / host bridge, also release the domain ID. The released ID can be reused again, for example when dynamically loading and unloading native PCI host bridge drivers. This change also allows to mix static device tree assignment and dynamic by kernel as all static allocations are reserved in dynamic pool. [bhelgaas: set "err" if "bus->domain_nr < 0"] Link: https://lore.kernel.org/r/20220714184130.5436-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 103 +++++++++++++++++++++++++------------------ drivers/pci/probe.c | 7 +++ drivers/pci/remove.c | 6 +++ include/linux/pci.h | 1 + 4 files changed, 74 insertions(+), 43 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2127aba3550b5..9f3cc829dfeea 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6743,60 +6743,70 @@ static void pci_no_domains(void) } #ifdef CONFIG_PCI_DOMAINS_GENERIC -static atomic_t __domain_nr = ATOMIC_INIT(-1); +static DEFINE_IDA(pci_domain_nr_static_ida); +static DEFINE_IDA(pci_domain_nr_dynamic_ida); -static int pci_get_new_domain_nr(void) +static void of_pci_reserve_static_domain_nr(void) { - return atomic_inc_return(&__domain_nr); + struct device_node *np; + int domain_nr; + + for_each_node_by_type(np, "pci") { + domain_nr = of_get_pci_domain_nr(np); + if (domain_nr < 0) + continue; + /* + * Permanently allocate domain_nr in dynamic_ida + * to prevent it from dynamic allocation. + */ + ida_alloc_range(&pci_domain_nr_dynamic_ida, + domain_nr, domain_nr, GFP_KERNEL); + } } static int of_pci_bus_find_domain_nr(struct device *parent) { - static int use_dt_domains = -1; - int domain = -1; + static bool static_domains_reserved = false; + int domain_nr; - if (parent) - domain = of_get_pci_domain_nr(parent->of_node); + /* On the first call scan device tree for static allocations. */ + if (!static_domains_reserved) { + of_pci_reserve_static_domain_nr(); + static_domains_reserved = true; + } + + if (parent) { + /* + * If domain is in DT, allocate it in static IDA. This + * prevents duplicate static allocations in case of errors + * in DT. + */ + domain_nr = of_get_pci_domain_nr(parent->of_node); + if (domain_nr >= 0) + return ida_alloc_range(&pci_domain_nr_static_ida, + domain_nr, domain_nr, + GFP_KERNEL); + } /* - * Check DT domain and use_dt_domains values. - * - * If DT domain property is valid (domain >= 0) and - * use_dt_domains != 0, the DT assignment is valid since this means - * we have not previously allocated a domain number by using - * pci_get_new_domain_nr(); we should also update use_dt_domains to - * 1, to indicate that we have just assigned a domain number from - * DT. - * - * If DT domain property value is not valid (ie domain < 0), and we - * have not previously assigned a domain number from DT - * (use_dt_domains != 1) we should assign a domain number by - * using the: - * - * pci_get_new_domain_nr() - * - * API and update the use_dt_domains value to keep track of method we - * are using to assign domain numbers (use_dt_domains = 0). - * - * All other combinations imply we have a platform that is trying - * to mix domain numbers obtained from DT and pci_get_new_domain_nr(), - * which is a recipe for domain mishandling and it is prevented by - * invalidating the domain value (domain = -1) and printing a - * corresponding error. + * If domain was not specified in DT, choose a free ID from dynamic + * allocations. All domain numbers from DT are permanently in + * dynamic allocations to prevent assigning them to other DT nodes + * without static domain. */ - if (domain >= 0 && use_dt_domains) { - use_dt_domains = 1; - } else if (domain < 0 && use_dt_domains != 1) { - use_dt_domains = 0; - domain = pci_get_new_domain_nr(); - } else { - if (parent) - pr_err("Node %pOF has ", parent->of_node); - pr_err("Inconsistent \"linux,pci-domain\" property in DT\n"); - domain = -1; - } + return ida_alloc(&pci_domain_nr_dynamic_ida, GFP_KERNEL); +} - return domain; +static void of_pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent) +{ + if (bus->domain_nr < 0) + return; + + /* Release domain from IDA where it was allocated. */ + if (of_get_pci_domain_nr(parent->of_node) == bus->domain_nr) + ida_free(&pci_domain_nr_static_ida, bus->domain_nr); + else + ida_free(&pci_domain_nr_dynamic_ida, bus->domain_nr); } int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) @@ -6804,6 +6814,13 @@ int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) return acpi_disabled ? of_pci_bus_find_domain_nr(parent) : acpi_pci_bus_find_domain_nr(bus); } + +void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent) +{ + if (!acpi_disabled) + return; + of_pci_bus_release_domain_nr(bus, parent); +} #endif /** diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 1d6f7b502020d..1e234189aff15 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -906,6 +906,10 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) bus->domain_nr = pci_bus_find_domain_nr(bus, parent); else bus->domain_nr = bridge->domain_nr; + if (bus->domain_nr < 0) { + err = bus->domain_nr; + goto free; + } #endif b = pci_find_bus(pci_domain_nr(bus), bridge->busnr); @@ -1030,6 +1034,9 @@ unregister: device_del(&bridge->dev); free: +#ifdef CONFIG_PCI_DOMAINS_GENERIC + pci_bus_release_domain_nr(bus, parent); +#endif kfree(bus); return err; } diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 4c54c75050dc1..0145aef1b9301 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -160,6 +160,12 @@ void pci_remove_root_bus(struct pci_bus *bus) pci_remove_bus(bus); host_bridge->bus = NULL; +#ifdef CONFIG_PCI_DOMAINS_GENERIC + /* Release domain_nr if it was dynamically allocated */ + if (host_bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_domain_nr(bus, host_bridge->dev.parent); +#endif + /* remove the host bridge */ device_del(&host_bridge->dev); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 2bda4a4e47e81..28af4414f7891 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1726,6 +1726,7 @@ static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) { return 0; } #endif int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); +void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent); #endif /* Some architectures require additional setup to direct VGA traffic */ -- GitLab From a055204b063ade914e6dd6e270d3c2c0453a3cf5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 17:55:25 -0700 Subject: [PATCH 0399/1423] leds: gpio: switch to using devm_fwnode_gpiod_get() devm_fwnode_get_gpiod_from_child() is going away as the name is too unwieldy, let's switch to using the new devm_fwnode_gpiod_get(). Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/leds/leds-gpio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 092eb59a7d325..ce4e79939731d 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -151,9 +151,8 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev) * will be updated after LED class device is registered, * Only then the final LED name is known. */ - led.gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL, child, - GPIOD_ASIS, - NULL); + led.gpiod = devm_fwnode_gpiod_get(dev, child, NULL, GPIOD_ASIS, + NULL); if (IS_ERR(led.gpiod)) { fwnode_handle_put(child); return ERR_CAST(led.gpiod); -- GitLab From 17521f263fc0ee2c202d5a762679c0ff0cf24b80 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 17:55:26 -0700 Subject: [PATCH 0400/1423] leds: lgm-sso: switch to using devm_fwnode_gpiod_get() devm_fwnode_get_gpiod_from_child() is going away as the name is too unwieldy, let's switch to using the new devm_fwnode_gpiod_get(). Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/leds/blink/leds-lgm-sso.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/leds/blink/leds-lgm-sso.c b/drivers/leds/blink/leds-lgm-sso.c index 6f270c0272fb1..35c61311e7fd8 100644 --- a/drivers/leds/blink/leds-lgm-sso.c +++ b/drivers/leds/blink/leds-lgm-sso.c @@ -635,9 +635,8 @@ __sso_led_dt_parse(struct sso_led_priv *priv, struct fwnode_handle *fw_ssoled) led->priv = priv; desc = &led->desc; - led->gpiod = devm_fwnode_get_gpiod_from_child(dev, NULL, - fwnode_child, - GPIOD_ASIS, NULL); + led->gpiod = devm_fwnode_gpiod_get(dev, fwnode_child, NULL, + GPIOD_ASIS, NULL); if (IS_ERR(led->gpiod)) { ret = dev_err_probe(dev, PTR_ERR(led->gpiod), "led: get gpio fail!\n"); goto __dt_err; -- GitLab From 2fe8e1dcf937272c5425e69947819894fcf077a6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 2 Sep 2022 17:55:27 -0700 Subject: [PATCH 0401/1423] gpiolib: remove devm_fwnode_get_[index_]gpiod_from_child() Now that there are no more users of these APIs in the kernel we can remove them. Signed-off-by: Dmitry Torokhov Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 36460ced060b0..45da8f137fe5d 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -581,27 +581,6 @@ struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, flags, label); } -static inline -struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, - const char *con_id, int index, - struct fwnode_handle *child, - enum gpiod_flags flags, - const char *label) -{ - return devm_fwnode_gpiod_get_index(dev, child, con_id, index, - flags, label); -} - -static inline -struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, - const char *con_id, - struct fwnode_handle *child, - enum gpiod_flags flags, - const char *label) -{ - return devm_fwnode_gpiod_get_index(dev, child, con_id, 0, flags, label); -} - #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) struct device_node; -- GitLab From 8afe82550240640617abfb3d6ba2c7579261e7fa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Nov 2022 15:38:52 +0200 Subject: [PATCH 0402/1423] gpiolib: of: Prepare of_gpiochip_add() / of_gpiochip_remove() for fwnode GPIO library is getting rid of of_node, fwnode should be utilized instead. Prepare of_gpiochip_add() / of_gpiochip_remove() for fwnode. Signed-off-by: Andy Shevchenko Reviewed-by: Dmitry Torokhov Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index be9c34cca3225..000020eb78d83 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -1104,9 +1104,11 @@ static int of_gpiochip_add_pin_range(struct gpio_chip *chip) { return 0; } int of_gpiochip_add(struct gpio_chip *chip) { + struct device_node *np; int ret; - if (!chip->of_node) + np = to_of_node(chip->fwnode); + if (!np) return 0; if (!chip->of_xlate) { @@ -1123,18 +1125,18 @@ int of_gpiochip_add(struct gpio_chip *chip) if (ret) return ret; - of_node_get(chip->of_node); + fwnode_handle_get(chip->fwnode); ret = of_gpiochip_scan_gpios(chip); if (ret) - of_node_put(chip->of_node); + fwnode_handle_put(chip->fwnode); return ret; } void of_gpiochip_remove(struct gpio_chip *chip) { - of_node_put(chip->of_node); + fwnode_handle_put(chip->fwnode); } void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) -- GitLab From 27043a7d500c4a3debb899c28bbf492492f64e58 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Nov 2022 15:38:53 +0200 Subject: [PATCH 0403/1423] gpiolib: of: Integrate of_gpiochip_init_valid_mask() into gpiochip_init_valid_mask() In preparation to complete fwnode switch, integrate of_gpiochip_init_valid_mask() into gpiochip_init_valid_mask(). Signed-off-by: Andy Shevchenko Reviewed-by: Dmitry Torokhov Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 42 ------------------------------ drivers/gpio/gpiolib-of.h | 5 ---- drivers/gpio/gpiolib.c | 54 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 48 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 000020eb78d83..4be3c21aa7188 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -112,24 +112,6 @@ static struct gpio_desc *of_xlate_and_get_gpiod_flags(struct gpio_chip *chip, return gpiochip_get_desc(chip, ret); } -/** - * of_gpio_need_valid_mask() - figure out if the OF GPIO driver needs - * to set the .valid_mask - * @gc: the target gpio_chip - * - * Return: true if the valid mask needs to be set - */ -bool of_gpio_need_valid_mask(const struct gpio_chip *gc) -{ - int size; - const struct device_node *np = gc->of_node; - - size = of_property_count_u32_elems(np, "gpio-reserved-ranges"); - if (size > 0 && size % 2 == 0) - return true; - return false; -} - /* * Overrides stated polarity of a gpio line and warns when there is a * discrepancy. @@ -989,28 +971,6 @@ void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc) } EXPORT_SYMBOL_GPL(of_mm_gpiochip_remove); -static void of_gpiochip_init_valid_mask(struct gpio_chip *chip) -{ - int len, i; - u32 start, count; - struct device_node *np = chip->of_node; - - len = of_property_count_u32_elems(np, "gpio-reserved-ranges"); - if (len < 0 || len % 2 != 0) - return; - - for (i = 0; i < len; i += 2) { - of_property_read_u32_index(np, "gpio-reserved-ranges", - i, &start); - of_property_read_u32_index(np, "gpio-reserved-ranges", - i + 1, &count); - if (start >= chip->ngpio || start + count > chip->ngpio) - continue; - - bitmap_clear(chip->valid_mask, start, count); - } -}; - #ifdef CONFIG_PINCTRL static int of_gpiochip_add_pin_range(struct gpio_chip *chip) { @@ -1119,8 +1079,6 @@ int of_gpiochip_add(struct gpio_chip *chip) if (chip->of_gpio_n_cells > MAX_PHANDLE_ARGS) return -EINVAL; - of_gpiochip_init_valid_mask(chip); - ret = of_gpiochip_add_pin_range(chip); if (ret) return ret; diff --git a/drivers/gpio/gpiolib-of.h b/drivers/gpio/gpiolib-of.h index 8af2bc899aab2..2c32a332ede59 100644 --- a/drivers/gpio/gpiolib-of.h +++ b/drivers/gpio/gpiolib-of.h @@ -14,7 +14,6 @@ struct gpio_desc *of_find_gpio(struct device *dev, int of_gpiochip_add(struct gpio_chip *gc); void of_gpiochip_remove(struct gpio_chip *gc); int of_gpio_get_count(struct device *dev, const char *con_id); -bool of_gpio_need_valid_mask(const struct gpio_chip *gc); void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev); #else static inline struct gpio_desc *of_find_gpio(struct device *dev, @@ -30,10 +29,6 @@ static inline int of_gpio_get_count(struct device *dev, const char *con_id) { return 0; } -static inline bool of_gpio_need_valid_mask(const struct gpio_chip *gc) -{ - return false; -} static inline void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) { diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index e8faedca6b14f..11fb7ec883e9f 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -445,9 +445,21 @@ static unsigned long *gpiochip_allocate_mask(struct gpio_chip *gc) return p; } +static unsigned int gpiochip_count_reserved_ranges(struct gpio_chip *gc) +{ + int size; + + /* Format is "start, count, ..." */ + size = fwnode_property_count_u32(gc->fwnode, "gpio-reserved-ranges"); + if (size > 0 && size % 2 == 0) + return size; + + return 0; +} + static int gpiochip_alloc_valid_mask(struct gpio_chip *gc) { - if (!(of_gpio_need_valid_mask(gc) || gc->init_valid_mask)) + if (!(gpiochip_count_reserved_ranges(gc) || gc->init_valid_mask)) return 0; gc->valid_mask = gpiochip_allocate_mask(gc); @@ -457,8 +469,48 @@ static int gpiochip_alloc_valid_mask(struct gpio_chip *gc) return 0; } +static int gpiochip_apply_reserved_ranges(struct gpio_chip *gc) +{ + unsigned int size; + u32 *ranges; + int ret; + + size = gpiochip_count_reserved_ranges(gc); + if (size == 0) + return 0; + + ranges = kmalloc_array(size, sizeof(*ranges), GFP_KERNEL); + if (!ranges) + return -ENOMEM; + + ret = fwnode_property_read_u32_array(gc->fwnode, "gpio-reserved-ranges", ranges, size); + if (ret) { + kfree(ranges); + return ret; + } + + while (size) { + u32 count = ranges[--size]; + u32 start = ranges[--size]; + + if (start >= gc->ngpio || start + count > gc->ngpio) + continue; + + bitmap_clear(gc->valid_mask, start, count); + } + + kfree(ranges); + return 0; +} + static int gpiochip_init_valid_mask(struct gpio_chip *gc) { + int ret; + + ret = gpiochip_apply_reserved_ranges(gc); + if (ret) + return ret; + if (gc->init_valid_mask) return gc->init_valid_mask(gc, gc->valid_mask, -- GitLab From bdf1da5df9da680589a7f74448dd0a94dd3e1446 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Mon, 7 Nov 2022 15:50:57 +0100 Subject: [PATCH 0404/1423] RDMA/siw: Fix immediate work request flush to completion queue Correctly set send queue element opcode during immediate work request flushing in post sendqueue operation, if the QP is in ERROR state. An undefined ocode value results in out-of-bounds access to an array for mapping the opcode between siw internal and RDMA core representation in work completion generation. It resulted in a KASAN BUG report of type 'global-out-of-bounds' during NFSoRDMA testing. This patch further fixes a potential case of a malicious user which may write undefined values for completion queue elements status or opcode, if the CQ is memory mapped to user land. It avoids the same out-of-bounds access to arrays for status and opcode mapping as described above. Fixes: 303ae1cdfdf7 ("rdma/siw: application interface") Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods") Reported-by: Olga Kornievskaia Reviewed-by: Tom Talpey Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20221107145057.895747-1-bmt@zurich.ibm.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw_cq.c | 24 ++++++++++++++-- drivers/infiniband/sw/siw/siw_verbs.c | 40 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index d68e37859e73b..acc7bcd538b53 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -56,8 +56,6 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) { memset(wc, 0, sizeof(*wc)); wc->wr_id = cqe->id; - wc->status = map_cqe_status[cqe->status].ib; - wc->opcode = map_wc_opcode[cqe->opcode]; wc->byte_len = cqe->bytes; /* @@ -71,10 +69,32 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) wc->wc_flags = IB_WC_WITH_INVALIDATE; } wc->qp = cqe->base_qp; + wc->opcode = map_wc_opcode[cqe->opcode]; + wc->status = map_cqe_status[cqe->status].ib; siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%pK\n", cq->cq_get % cq->num_cqe, cqe->opcode, cqe->flags, (void *)(uintptr_t)cqe->id); + } else { + /* + * A malicious user may set invalid opcode or + * status in the user mmapped CQE array. + * Sanity check and correct values in that case + * to avoid out-of-bounds access to global arrays + * for opcode and status mapping. + */ + u8 opcode = cqe->opcode; + u16 status = cqe->status; + + if (opcode >= SIW_NUM_OPCODES) { + opcode = 0; + status = IB_WC_GENERAL_ERR; + } else if (status >= SIW_NUM_WC_STATUS) { + status = IB_WC_GENERAL_ERR; + } + wc->opcode = map_wc_opcode[opcode]; + wc->status = map_cqe_status[status].ib; + } WRITE_ONCE(cqe->flags, 0); cq->cq_get++; diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 3e814cfb298cf..906fde1a2a0de 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -676,13 +676,45 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { - struct siw_sqe sqe = {}; int rv = 0; while (wr) { - sqe.id = wr->wr_id; - sqe.opcode = wr->opcode; - rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR); + struct siw_sqe sqe = {}; + + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + sqe.opcode = SIW_OP_WRITE; + break; + case IB_WR_RDMA_READ: + sqe.opcode = SIW_OP_READ; + break; + case IB_WR_RDMA_READ_WITH_INV: + sqe.opcode = SIW_OP_READ_LOCAL_INV; + break; + case IB_WR_SEND: + sqe.opcode = SIW_OP_SEND; + break; + case IB_WR_SEND_WITH_IMM: + sqe.opcode = SIW_OP_SEND_WITH_IMM; + break; + case IB_WR_SEND_WITH_INV: + sqe.opcode = SIW_OP_SEND_REMOTE_INV; + break; + case IB_WR_LOCAL_INV: + sqe.opcode = SIW_OP_INVAL_STAG; + break; + case IB_WR_REG_MR: + sqe.opcode = SIW_OP_REG_MR; + break; + default: + rv = -EINVAL; + break; + } + if (!rv) { + sqe.id = wr->wr_id; + rv = siw_sqe_complete(qp, &sqe, 0, + SIW_WC_WR_FLUSH_ERR); + } if (rv) { if (bad_wr) *bad_wr = wr; -- GitLab From 0b9ca98b722969660ad98b39f766a561ccb39f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:03:07 +0000 Subject: [PATCH 0405/1423] perf/x86/core: Zero @lbr instead of returning -1 in x86_perf_get_lbr() stub Drop the return value from x86_perf_get_lbr() and have the stub zero out the @lbr structure instead of returning -1 to indicate "no LBR support". KVM doesn't actually check the return value, and instead subtly relies on zeroing the number of LBRs in intel_pmu_init(). Formalize "nr=0 means unsupported" so that KVM doesn't need to add a pointless check on the return value to fix KVM's benign bug. Note, the stub is necessary even though KVM x86 selects PERF_EVENTS and the caller exists only when CONFIG_KVM_INTEL=y. Despite the name, KVM_INTEL doesn't strictly require CPU_SUP_INTEL, it can be built with any of INTEL || CENTAUR || ZHAOXIN CPUs. Signed-off-by: Sean Christopherson Message-Id: <20221006000314.73240-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/events/intel/lbr.c | 6 +----- arch/x86/include/asm/perf_event.h | 6 +++--- arch/x86/kvm/vmx/capabilities.h | 3 ++- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8259d725054d0..4dbde69c423ba 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1603,10 +1603,8 @@ clear_arch_lbr: * x86_perf_get_lbr - get the LBR records information * * @lbr: the caller's memory to store the LBR records information - * - * Returns: 0 indicates the LBR info has been successfully obtained */ -int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { int lbr_fmt = x86_pmu.intel_cap.lbr_format; @@ -1614,8 +1612,6 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; - - return 0; } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9ac46dbe57d48..5d0f6891ae611 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { - return -1; + memset(lbr, 0, sizeof(*lbr)); } #endif diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 07254314f3dd5..479124e49bbda 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -407,7 +407,8 @@ static inline u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr) + x86_perf_get_lbr(&lbr); + if (lbr.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; if (vmx_pebs_supported()) { -- GitLab From bec46859fb9d797a21c983100b1f425bebe89747 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:03:11 +0000 Subject: [PATCH 0406/1423] KVM: x86: Track supported PERF_CAPABILITIES in kvm_caps Track KVM's supported PERF_CAPABILITIES in kvm_caps instead of computing the supported capabilities on the fly every time. Using kvm_caps will also allow for future cleanups as the kvm_caps values can be used directly in common x86 code. Signed-off-by: Sean Christopherson Acked-by: Like Xu Message-Id: <20221006000314.73240-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/capabilities.h | 25 ------------------------ arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/vmx/vmx.c | 34 +++++++++++++++++++++++++++++---- arch/x86/kvm/x86.h | 1 + 5 files changed, 34 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f88c8e6766e4..d15f1934c9042 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2714,6 +2714,7 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; case MSR_IA32_PERF_CAPABILITIES: + msr->data = kvm_caps.supported_perf_cap; return 0; default: return KVM_MSR_RET_INVALID; @@ -4865,6 +4866,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); + kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 479124e49bbda..cd2ac9536c998 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -395,31 +395,6 @@ static inline bool vmx_pebs_supported(void) return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; } -static inline u64 vmx_get_perf_capabilities(void) -{ - u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; - u64 host_perf_cap = 0; - - if (!enable_pmu) - return 0; - - if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - - x86_perf_get_lbr(&lbr); - if (lbr.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - - if (vmx_pebs_supported()) { - perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; - } - - return perf_cap; -} - static inline bool cpu_has_notify_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10b33da9bd058..9fabfe71fd879 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -631,7 +631,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].current_config = 0; } - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; lbr_desc->records.nr = 0; lbr_desc->event = NULL; lbr_desc->msr_passthrough = false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 63247c57c72cc..3fec439530516 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1850,7 +1850,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); case MSR_IA32_PERF_CAPABILITIES: - msr->data = vmx_get_perf_capabilities(); + msr->data = kvm_caps.supported_perf_cap; return 0; default: return KVM_MSR_RET_INVALID; @@ -2029,7 +2029,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2342,14 +2342,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != - (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; } if (data & PERF_CAP_PEBS_FORMAT) { if ((data & PERF_CAP_PEBS_MASK) != - (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) return 1; @@ -7669,6 +7669,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } +static u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + struct x86_pmu_lbr lbr; + u64 host_perf_cap = 0; + + if (!enable_pmu) + return 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; +} + static __init void vmx_set_cpu_caps(void) { kvm_set_cpu_caps(); @@ -7691,6 +7716,7 @@ static __init void vmx_set_cpu_caps(void) if (!enable_pmu) kvm_cpu_cap_clear(X86_FEATURE_PDCM); + kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 829d3134c1eb0..9de72586f4065 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -27,6 +27,7 @@ struct kvm_caps { u64 supported_mce_cap; u64 supported_xcr0; u64 supported_xss; + u64 supported_perf_cap; }; void kvm_spurious_fault(void); -- GitLab From 6c6f82bea96fddb96898edbbee248ad362b768f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:03:12 +0000 Subject: [PATCH 0407/1423] KVM: x86: Init vcpu->arch.perf_capabilities in common x86 code Initialize vcpu->arch.perf_capabilities in x86's kvm_arch_vcpu_create() instead of deferring initialization to vendor code. For better or worse, common x86 handles reads and writes to the MSR, and so common x86 should also handle initializing the MSR. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221006000314.73240-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 1 - arch/x86/kvm/x86.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9fabfe71fd879..b7c3a9874a93c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].current_config = 0; } - vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; lbr_desc->records.nr = 0; lbr_desc->event = NULL; lbr_desc->msr_passthrough = false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecea83f0da498..6ce7c80180d24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11893,6 +11893,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; kvm_async_pf_hash_reset(vcpu); + + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; -- GitLab From 5fe9805dc2f58bebb3bf5dd298559c4bad5f0448 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:03:13 +0000 Subject: [PATCH 0408/1423] KVM: x86: Handle PERF_CAPABILITIES in common x86's kvm_get_msr_feature() Handle PERF_CAPABILITIES directly in kvm_get_msr_feature() now that the supported value is available in kvm_caps. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221006000314.73240-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 3 --- arch/x86/kvm/vmx/vmx.c | 3 --- arch/x86/kvm/x86.c | 3 +++ 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d15f1934c9042..3fc8e4999891a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2713,9 +2713,6 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; - case MSR_IA32_PERF_CAPABILITIES: - msr->data = kvm_caps.supported_perf_cap; - return 0; default: return KVM_MSR_RET_INVALID; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3fec439530516..7a7e14d4d78c2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1849,9 +1849,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - case MSR_IA32_PERF_CAPABILITIES: - msr->data = kvm_caps.supported_perf_cap; - return 0; default: return KVM_MSR_RET_INVALID; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ce7c80180d24..6f8a370d2b1d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1648,6 +1648,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) case MSR_IA32_ARCH_CAPABILITIES: msr->data = kvm_get_arch_capabilities(); break; + case MSR_IA32_PERF_CAPABILITIES: + msr->data = kvm_caps.supported_perf_cap; + break; case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; -- GitLab From 686e0f0324f07ea1979462f659b24b1195996fe0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:03:14 +0000 Subject: [PATCH 0409/1423] KVM: x86: Directly query supported PERF_CAPABILITIES for WRMSR checks Use kvm_caps.supported_perf_cap directly instead of bouncing through kvm_get_msr_feature() when checking the incoming value for writes to PERF_CAPABILITIES. Note, kvm_get_msr_feature() is guaranteed to succeed when getting PERF_CAPABILITIES, i.e. dropping that check is a nop. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221006000314.73240-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f8a370d2b1d5..30a5365f4b320 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3563,20 +3563,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; - case MSR_IA32_PERF_CAPABILITIES: { - struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; - + case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated) return 1; - if (kvm_get_msr_feature(&msr_ent)) - return 1; - if (data & ~msr_ent.data) + if (data & ~kvm_caps.supported_perf_cap) return 1; vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); return 0; - } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: -- GitLab From 0f9edb8cab29667ef5ac50736bb1f9a0557e7f94 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Sep 2022 17:05:37 +0800 Subject: [PATCH 0410/1423] KVM: x86: remove obsolete kvm_mmu_gva_to_gpa_fetch() There's no caller. Remove it. Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Message-Id: <20220913090537.25195-1-linmiaohe@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/x86.c | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f05ebaa26f0ff..1723a357190d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1909,8 +1909,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 30a5365f4b320..3c63ba5512e9a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7154,16 +7154,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - access |= PFERR_FETCH_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); -} - gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { -- GitLab From fa3e42037ef53c2aea85faf51d6f947e16dd8fc5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Sep 2022 17:17:25 +0800 Subject: [PATCH 0411/1423] KVM: x86/mmu: fix some comment typos Fix some typos in comments. Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Message-Id: <20220913091725.35953-1-linmiaohe@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/spte.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f81539061d64..639ac64a4e8c9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1894,7 +1894,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.invalid) return true; - /* TDP MMU pages due not use the MMU generation. */ + /* TDP MMU pages do not use the MMU generation. */ return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 7670c13ce251b..79560d77aa4c7 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on - * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. -- GitLab From 3adbdf81038801ee84367845e78bb1a3b36bde39 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Sep 2022 16:54:52 +0800 Subject: [PATCH 0412/1423] KVM: x86/mmu: use helper macro SPTE_ENT_PER_PAGE Use helper macro SPTE_ENT_PER_PAGE to get the number of spte entries per page. Minor readability improvement. Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Message-Id: <20220913085452.25561-1-linmiaohe@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 639ac64a4e8c9..2640871bdcf17 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1645,7 +1645,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *pos; u64 *end; - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); -- GitLab From bb5c8abea09453d8b137e6613980b8e257485868 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Aug 2022 15:52:09 -0700 Subject: [PATCH 0413/1423] KVM: x86: Insert "AMD" in KVM_X86_FEATURE_PSFD Intel and AMD have separate CPUID bits for each SPEC_CTRL bit. In the case of every bit other than PFSD, the Intel CPUID bit has no vendor name qualifier, but the AMD CPUID bit does. For consistency, rename KVM_X86_FEATURE_PSFD to KVM_X86_FEATURE_AMD_PSFD. No functional change intended. Signed-off-by: Jim Mattson Cc: Babu Moger Message-Id: <20220830225210.2381310-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 62bc7a01cecca..6b5912578eddc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) * This one is tied to SSB in the user API, and not * visible in /proc/cpuinfo. */ -#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) @@ -694,7 +694,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - __feature_bit(KVM_X86_FEATURE_PSFD) + __feature_bit(KVM_X86_FEATURE_AMD_PSFD) ); /* -- GitLab From 00009406f0dbc53b95b9062c0cc297d6893ff394 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Thu, 20 Oct 2022 23:01:13 -0300 Subject: [PATCH 0414/1423] x86/kvm: Remove unused virt to phys translation in kvm_guest_cpu_init() Presumably, this was introduced due to a conflict resolution with commit ef68017eb570 ("x86/kvm: Handle async page faults directly through do_page_fault()"), given that the last posted version [1] of the blamed commit was not based on the aforementioned commit. [1] https://lore.kernel.org/kvm/20200525144125.143875-9-vkuznets@redhat.com/ Fixes: b1d405751cd5 ("KVM: x86: Switch KVM guest to using interrupts for page ready APF delivery") Signed-off-by: Rafael Mendonca Message-Id: <20221021020113.922027-1-rafaelmendsr@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d4e48b4a438b2..cf886f86038a0 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); -- GitLab From 07a368b3f55a79d33cad113d506b279e04fd2a00 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:27 +0300 Subject: [PATCH 0415/1423] bug: introduce ASSERT_STRUCT_OFFSET ASSERT_STRUCT_OFFSET allows to assert during the build of the kernel that a field in a struct have an expected offset. KVM used to have such macro, but there is almost nothing KVM specific in it so move it to build_bug.h, so that it can be used in other places in KVM. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-10-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs12.h | 5 ++--- include/linux/build_bug.h | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 746129ddd5ae0..01936013428b5 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -208,9 +208,8 @@ struct __packed vmcs12 { /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") +#define CHECK_OFFSET(field, loc) \ + ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) static inline void vmx_check_vmcs12_offsets(void) { diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index e3a0be2c90ad9..3aa3640f8c181 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -77,4 +77,13 @@ #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) + +/* + * Compile time check that field has an expected offset + */ +#define ASSERT_STRUCT_OFFSET(type, field, expected_offset) \ + BUILD_BUG_ON_MSG(offsetof(type, field) != (expected_offset), \ + "Offset of " #field " in " #type " has changed.") + + #endif /* _LINUX_BUILD_BUG_H */ -- GitLab From d08b48585309247d4d28051dd7a315eef5d1db26 Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Mon, 24 Oct 2022 11:44:48 -0500 Subject: [PATCH 0416/1423] KVM: SVM: Name and check reserved fields with structs offset Rename reserved fields on all structs in arch/x86/include/asm/svm.h following their offset within the structs. Include compile time checks for this in the same place where other BUILD_BUG_ON for the structs are. This also solves that fields of struct sev_es_save_area are named by their order of appearance, but right now they jump from reserved_5 to reserved_7. Link: https://lkml.org/lkml/2022/10/22/376 Signed-off-by: Carlos Bilbao Message-Id: <20221024164448.203351-1-carlos.bilbao@amd.com> [Use ASSERT_STRUCT_OFFSET + fix a couple wrong offsets. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 93 ++++++++++++++++++++++++++------------ arch/x86/kvm/svm/sev.c | 2 +- 2 files changed, 66 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0361626841bc0..4352b46dd20c9 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -293,12 +293,13 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[42]; + /* Reserved fields are named following their struct offset */ + u8 reserved_0xa0[42]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_0xd8[112]; u64 cr4; u64 cr3; u64 cr0; @@ -306,7 +307,7 @@ struct vmcb_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u8 reserved_0x180[88]; u64 rsp; u64 s_cet; u64 ssp; @@ -321,14 +322,14 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_6[72]; + u8 reserved_0x298[72]; u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; @@ -349,12 +350,12 @@ struct sev_es_save_area { u64 vmpl2_ssp; u64 vmpl3_ssp; u64 u_cet; - u8 reserved_1[2]; + u8 reserved_0xc8[2]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[104]; + u8 reserved_0xd8[104]; u64 xss; u64 cr4; u64 cr3; @@ -371,7 +372,7 @@ struct sev_es_save_area { u64 dr1_addr_mask; u64 dr2_addr_mask; u64 dr3_addr_mask; - u8 reserved_4[24]; + u8 reserved_0x1c0[24]; u64 rsp; u64 s_cet; u64 ssp; @@ -386,21 +387,21 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_7[80]; + u8 reserved_0x298[80]; u32 pkru; - u8 reserved_8[20]; - u64 reserved_9; /* rax already available at 0x01f8 */ + u32 tsc_aux; + u8 reserved_0x2f0[24]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 reserved_0x320; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -412,7 +413,7 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_11[16]; + u8 reserved_0x380[16]; u64 guest_exit_info_1; u64 guest_exit_info_2; u64 guest_exit_int_info; @@ -425,7 +426,7 @@ struct sev_es_save_area { u64 pcpu_id; u64 event_inj; u64 xcr0; - u8 reserved_12[16]; + u8 reserved_0x3f0[16]; /* Floating point area */ u64 x87_dp; @@ -443,23 +444,23 @@ struct sev_es_save_area { } __packed; struct ghcb_save_area { - u8 reserved_1[203]; + u8 reserved_0x0[203]; u8 cpl; - u8 reserved_2[116]; + u8 reserved_0xcc[116]; u64 xss; - u8 reserved_3[24]; + u8 reserved_0x148[24]; u64 dr7; - u8 reserved_4[16]; + u8 reserved_0x168[16]; u64 rip; - u8 reserved_5[88]; + u8 reserved_0x180[88]; u64 rsp; - u8 reserved_6[24]; + u8 reserved_0x1e0[24]; u64 rax; - u8 reserved_7[264]; + u8 reserved_0x200[264]; u64 rcx; u64 rdx; u64 rbx; - u8 reserved_8[8]; + u8 reserved_0x320[8]; u64 rbp; u64 rsi; u64 rdi; @@ -471,12 +472,12 @@ struct ghcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_9[16]; + u8 reserved_0x380[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_10[56]; + u8 reserved_0x3b0[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; @@ -490,7 +491,7 @@ struct ghcb { u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; - u8 reserved_1[10]; + u8 reserved_0xff0[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ u32 ghcb_usage; } __packed; @@ -502,6 +503,9 @@ struct ghcb { #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE +#define BUILD_BUG_RESERVED_OFFSET(x, y) \ + ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y) + static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); @@ -509,6 +513,39 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); + + /* Check offsets of reserved fields */ + + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298); + + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); + + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0); + + BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } struct vmcb { diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index efaaef2b7ae11..69dbf17f0d6ad 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2648,7 +2648,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) ghcb_scratch_beg = control->ghcb_gpa + offsetof(struct ghcb, shared_buffer); ghcb_scratch_end = control->ghcb_gpa + - offsetof(struct ghcb, reserved_1); + offsetof(struct ghcb, reserved_0xff0); /* * If the scratch area begins within the GHCB, it must be -- GitLab From b0b42197b5c6f0d9447e5b710d64c671be8deec1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:09 -0400 Subject: [PATCH 0417/1423] KVM: x86: start moving SMM-related functions to new files Create a new header and source with code related to system management mode emulation. Entry and exit will move there too; for now, opportunistically rename put_smstate to PUT_SMSTATE while moving it to smm.h, and adjust the SMM state saving code. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-2-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 -- arch/x86/kvm/Makefile | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/kvm_cache_regs.h | 5 -- arch/x86/kvm/lapic.c | 8 +- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/mmu/mmu.c | 1 + arch/x86/kvm/smm.c | 37 ++++++++ arch/x86/kvm/smm.h | 25 ++++++ arch/x86/kvm/svm/nested.c | 1 + arch/x86/kvm/svm/svm.c | 5 +- arch/x86/kvm/vmx/nested.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 148 ++++++++++++-------------------- 14 files changed, 132 insertions(+), 110 deletions(-) create mode 100644 arch/x86/kvm/smm.c create mode 100644 arch/x86/kvm/smm.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1723a357190d6..c70e84e69cca6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2087,12 +2087,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - int kvm_cpu_dirty_log_size(void); int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f453a0f96e243..b584cb0e06bd8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o +kvm-y += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a43261d25a2a..eea29aac83bd8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -30,6 +30,7 @@ #include "tss.h" #include "mmu.h" #include "pmu.h" +#include "smm.h" /* * Operand types diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3febc342360cc..c09174f73a344 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } -static inline bool is_smm(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hflags & HF_SMM_MASK; -} - #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d7639d126e6c7..1bb63746e9910 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -42,6 +42,7 @@ #include "x86.h" #include "cpuid.h" #include "hyperv.h" +#include "smm.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -1170,9 +1171,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - result = 1; - kvm_make_request(KVM_REQ_SMI, vcpu); - kvm_vcpu_kick(vcpu); + if (!kvm_inject_smi(vcpu)) { + kvm_vcpu_kick(vcpu); + result = 1; + } break; case APIC_DM_NMI: diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a5ac4a5a51792..28e3769066e21 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -7,7 +7,7 @@ #include #include "hyperv.h" -#include "kvm_cache_regs.h" +#include "smm.h" #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2640871bdcf17..f8c92a4a35faa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -22,6 +22,7 @@ #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "smm.h" #include "kvm_emulate.h" #include "cpuid.h" #include "spte.h" diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c new file mode 100644 index 0000000000000..b91c48d91f6ed --- /dev/null +++ b/arch/x86/kvm/smm.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "smm.h" +#include "trace.h" + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) +{ + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); + + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; + } + + kvm_mmu_reset_context(vcpu); +} + +void process_smi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h new file mode 100644 index 0000000000000..d85d4ccd32dd1 --- /dev/null +++ b/arch/x86/kvm/smm.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_KVM_SMM_H +#define ASM_KVM_SMM_H + +#define GET_SMSTATE(type, buf, offset) \ + (*(type *)((buf) + (offset) - 0x7e00)) + +#define PUT_SMSTATE(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + return 0; +} + +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); +void process_smi(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 4c620999d230a..cc0fd75f7cbab 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "lapic.h" #include "svm.h" diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3fc8e4999891a..3bb07ec789859 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -6,6 +6,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "pmu.h" @@ -4407,9 +4408,9 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); + PUT_SMSTATE(u64, smstate, 0x7ed8, 1); /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + PUT_SMSTATE(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0c62352dda6ab..61a2e551640a0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -16,6 +16,7 @@ #include "trace.h" #include "vmx.h" #include "x86.h" +#include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7a7e14d4d78c2..49065614a3dbb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -66,6 +66,7 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "smm.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c63ba5512e9a..d936b0f15d7da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,6 +30,7 @@ #include "hyperv.h" #include "lapic.h" #include "xen.h" +#include "smm.h" #include #include @@ -119,7 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); @@ -4889,13 +4889,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } -static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_SMI, vcpu); - - return 0; -} - static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -5118,8 +5111,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); - static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -5572,7 +5563,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SMI: { - r = kvm_vcpu_ioctl_smi(vcpu); + r = kvm_inject_smi(vcpu); break; } case KVM_SET_CPUID: { @@ -8569,29 +8560,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) -{ - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); - - if (entering_smm) { - vcpu->arch.hflags |= HF_SMM_MASK; - } else { - vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - - /* - * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, - * on SMM exit we still need to reload them from - * guest memory - */ - vcpu->arch.pdptrs_from_userspace = false; - } - - kvm_mmu_reset_context(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -10088,16 +10056,16 @@ static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) int offset; kvm_get_segment(vcpu, &seg, n); - put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); + PUT_SMSTATE(u32, buf, 0x7fa8 + n * 4, seg.selector); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - put_smstate(u32, buf, offset + 8, seg.base); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); + PUT_SMSTATE(u32, buf, offset + 8, seg.base); + PUT_SMSTATE(u32, buf, offset + 4, seg.limit); + PUT_SMSTATE(u32, buf, offset, enter_smm_get_segment_flags(&seg)); } #ifdef CONFIG_X86_64 @@ -10111,10 +10079,10 @@ static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) offset = 0x7e00 + n * 16; flags = enter_smm_get_segment_flags(&seg) >> 8; - put_smstate(u16, buf, offset, seg.selector); - put_smstate(u16, buf, offset + 2, flags); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u64, buf, offset + 8, seg.base); + PUT_SMSTATE(u16, buf, offset, seg.selector); + PUT_SMSTATE(u16, buf, offset + 2, flags); + PUT_SMSTATE(u32, buf, offset + 4, seg.limit); + PUT_SMSTATE(u64, buf, offset + 8, seg.base); } #endif @@ -10125,47 +10093,47 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) unsigned long val; int i; - put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); + PUT_SMSTATE(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); kvm_get_dr(vcpu, 6, &val); - put_smstate(u32, buf, 0x7fcc, (u32)val); + PUT_SMSTATE(u32, buf, 0x7fcc, (u32)val); kvm_get_dr(vcpu, 7, &val); - put_smstate(u32, buf, 0x7fc8, (u32)val); + PUT_SMSTATE(u32, buf, 0x7fc8, (u32)val); kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u32, buf, 0x7fc4, seg.selector); - put_smstate(u32, buf, 0x7f64, seg.base); - put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); + PUT_SMSTATE(u32, buf, 0x7fc4, seg.selector); + PUT_SMSTATE(u32, buf, 0x7f64, seg.base); + PUT_SMSTATE(u32, buf, 0x7f60, seg.limit); + PUT_SMSTATE(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u32, buf, 0x7fc0, seg.selector); - put_smstate(u32, buf, 0x7f80, seg.base); - put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); + PUT_SMSTATE(u32, buf, 0x7fc0, seg.selector); + PUT_SMSTATE(u32, buf, 0x7f80, seg.base); + PUT_SMSTATE(u32, buf, 0x7f7c, seg.limit); + PUT_SMSTATE(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f74, dt.address); - put_smstate(u32, buf, 0x7f70, dt.size); + PUT_SMSTATE(u32, buf, 0x7f74, dt.address); + PUT_SMSTATE(u32, buf, 0x7f70, dt.size); static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f58, dt.address); - put_smstate(u32, buf, 0x7f54, dt.size); + PUT_SMSTATE(u32, buf, 0x7f58, dt.address); + PUT_SMSTATE(u32, buf, 0x7f54, dt.size); for (i = 0; i < 6; i++) enter_smm_save_seg_32(vcpu, buf, i); - put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + PUT_SMSTATE(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020000); - put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); + PUT_SMSTATE(u32, buf, 0x7efc, 0x00020000); + PUT_SMSTATE(u32, buf, 0x7ef8, vcpu->arch.smbase); } #ifdef CONFIG_X86_64 @@ -10177,46 +10145,46 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) int i; for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); + PUT_SMSTATE(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); - put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + PUT_SMSTATE(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); kvm_get_dr(vcpu, 6, &val); - put_smstate(u64, buf, 0x7f68, val); + PUT_SMSTATE(u64, buf, 0x7f68, val); kvm_get_dr(vcpu, 7, &val); - put_smstate(u64, buf, 0x7f60, val); + PUT_SMSTATE(u64, buf, 0x7f60, val); - put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); - put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); + PUT_SMSTATE(u32, buf, 0x7f00, vcpu->arch.smbase); /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020064); + PUT_SMSTATE(u32, buf, 0x7efc, 0x00020064); - put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); + PUT_SMSTATE(u64, buf, 0x7ed0, vcpu->arch.efer); kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e94, seg.limit); - put_smstate(u64, buf, 0x7e98, seg.base); + PUT_SMSTATE(u16, buf, 0x7e90, seg.selector); + PUT_SMSTATE(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); + PUT_SMSTATE(u32, buf, 0x7e94, seg.limit); + PUT_SMSTATE(u64, buf, 0x7e98, seg.base); static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e84, dt.size); - put_smstate(u64, buf, 0x7e88, dt.address); + PUT_SMSTATE(u32, buf, 0x7e84, dt.size); + PUT_SMSTATE(u64, buf, 0x7e88, dt.address); kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e74, seg.limit); - put_smstate(u64, buf, 0x7e78, seg.base); + PUT_SMSTATE(u16, buf, 0x7e70, seg.selector); + PUT_SMSTATE(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); + PUT_SMSTATE(u32, buf, 0x7e74, seg.limit); + PUT_SMSTATE(u64, buf, 0x7e78, seg.base); static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e64, dt.size); - put_smstate(u64, buf, 0x7e68, dt.address); + PUT_SMSTATE(u32, buf, 0x7e64, dt.size); + PUT_SMSTATE(u64, buf, 0x7e68, dt.address); for (i = 0; i < 6; i++) enter_smm_save_seg_64(vcpu, buf, i); @@ -10302,12 +10270,6 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void process_smi(struct kvm_vcpu *vcpu) -{ - vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { -- GitLab From c53da4f3af6e613e82f88abc6eb988e44c5dadd7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:10 -0400 Subject: [PATCH 0418/1423] KVM: x86: move SMM entry to a new file Some users of KVM implement the UEFI variable store through a paravirtual device that does not require the "SMM lockbox" component of edk2, and would like to compile out system management mode. In preparation for that, move the SMM entry code out of x86.c and into a new file. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-3-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/smm.c | 235 +++++++++++++++++++++++++++++++ arch/x86/kvm/smm.h | 1 + arch/x86/kvm/x86.c | 239 +------------------------------- 4 files changed, 239 insertions(+), 237 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c70e84e69cca6..612ef60631c1f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1844,6 +1844,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index b91c48d91f6ed..26a6859e421fe 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -5,6 +5,7 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "smm.h" +#include "cpuid.h" #include "trace.h" void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) @@ -35,3 +36,237 @@ void process_smi(struct kvm_vcpu *vcpu) vcpu->arch.smi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); } + +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + + kvm_get_segment(vcpu, &seg, n); + PUT_SMSTATE(u32, buf, 0x7fa8 + n * 4, seg.selector); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + PUT_SMSTATE(u32, buf, offset + 8, seg.base); + PUT_SMSTATE(u32, buf, offset + 4, seg.limit); + PUT_SMSTATE(u32, buf, offset, enter_smm_get_segment_flags(&seg)); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + u16 flags; + + kvm_get_segment(vcpu, &seg, n); + offset = 0x7e00 + n * 16; + + flags = enter_smm_get_segment_flags(&seg) >> 8; + PUT_SMSTATE(u16, buf, offset, seg.selector); + PUT_SMSTATE(u16, buf, offset + 2, flags); + PUT_SMSTATE(u32, buf, offset + 4, seg.limit); + PUT_SMSTATE(u64, buf, offset + 8, seg.base); +} +#endif + +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + PUT_SMSTATE(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + PUT_SMSTATE(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + + for (i = 0; i < 8; i++) + PUT_SMSTATE(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); + + kvm_get_dr(vcpu, 6, &val); + PUT_SMSTATE(u32, buf, 0x7fcc, (u32)val); + kvm_get_dr(vcpu, 7, &val); + PUT_SMSTATE(u32, buf, 0x7fc8, (u32)val); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + PUT_SMSTATE(u32, buf, 0x7fc4, seg.selector); + PUT_SMSTATE(u32, buf, 0x7f64, seg.base); + PUT_SMSTATE(u32, buf, 0x7f60, seg.limit); + PUT_SMSTATE(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + PUT_SMSTATE(u32, buf, 0x7fc0, seg.selector); + PUT_SMSTATE(u32, buf, 0x7f80, seg.base); + PUT_SMSTATE(u32, buf, 0x7f7c, seg.limit); + PUT_SMSTATE(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + PUT_SMSTATE(u32, buf, 0x7f74, dt.address); + PUT_SMSTATE(u32, buf, 0x7f70, dt.size); + + static_call(kvm_x86_get_idt)(vcpu, &dt); + PUT_SMSTATE(u32, buf, 0x7f58, dt.address); + PUT_SMSTATE(u32, buf, 0x7f54, dt.size); + + for (i = 0; i < 6; i++) + enter_smm_save_seg_32(vcpu, buf, i); + + PUT_SMSTATE(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + + /* revision id */ + PUT_SMSTATE(u32, buf, 0x7efc, 0x00020000); + PUT_SMSTATE(u32, buf, 0x7ef8, vcpu->arch.smbase); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + PUT_SMSTATE(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); + + PUT_SMSTATE(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + PUT_SMSTATE(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + + kvm_get_dr(vcpu, 6, &val); + PUT_SMSTATE(u64, buf, 0x7f68, val); + kvm_get_dr(vcpu, 7, &val); + PUT_SMSTATE(u64, buf, 0x7f60, val); + + PUT_SMSTATE(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + PUT_SMSTATE(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + + PUT_SMSTATE(u32, buf, 0x7f00, vcpu->arch.smbase); + + /* revision id */ + PUT_SMSTATE(u32, buf, 0x7efc, 0x00020064); + + PUT_SMSTATE(u64, buf, 0x7ed0, vcpu->arch.efer); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + PUT_SMSTATE(u16, buf, 0x7e90, seg.selector); + PUT_SMSTATE(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); + PUT_SMSTATE(u32, buf, 0x7e94, seg.limit); + PUT_SMSTATE(u64, buf, 0x7e98, seg.base); + + static_call(kvm_x86_get_idt)(vcpu, &dt); + PUT_SMSTATE(u32, buf, 0x7e84, dt.size); + PUT_SMSTATE(u64, buf, 0x7e88, dt.address); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + PUT_SMSTATE(u16, buf, 0x7e70, seg.selector); + PUT_SMSTATE(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); + PUT_SMSTATE(u32, buf, 0x7e74, seg.limit); + PUT_SMSTATE(u64, buf, 0x7e78, seg.base); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + PUT_SMSTATE(u32, buf, 0x7e64, dt.size); + PUT_SMSTATE(u64, buf, 0x7e68, dt.address); + + for (i = 0; i < 6; i++) + enter_smm_save_seg_64(vcpu, buf, i); +} +#endif + +void enter_smm(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + struct desc_ptr dt; + unsigned long cr0; + char buf[512]; + + memset(buf, 0, 512); +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + enter_smm_save_state_64(vcpu, buf); + else +#endif + enter_smm_save_state_32(vcpu, buf); + + /* + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. + */ + static_call(kvm_x86_enter_smm)(vcpu, buf); + + kvm_smm_changed(vcpu, true); + kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + static_call(kvm_x86_set_nmi_mask)(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + static_call(kvm_x86_set_cr0)(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + static_call(kvm_x86_set_cr4)(vcpu, 0); + + /* Undocumented: IDT limit is set to zero on entry to SMM. */ + dt.address = dt.size = 0; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + static_call(kvm_x86_set_efer)(vcpu, 0); +#endif + + kvm_update_cpuid_runtime(vcpu); + kvm_mmu_reset_context(vcpu); +} diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index d85d4ccd32dd1..aacc6dac2c99a 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -20,6 +20,7 @@ static inline bool is_smm(struct kvm_vcpu *vcpu) } void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); +void enter_smm(struct kvm_vcpu *vcpu); void process_smi(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d936b0f15d7da..0730b16564f99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -120,7 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); @@ -7108,8 +7107,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return handled; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { static_call(kvm_x86_set_segment)(vcpu, var, seg); } @@ -10036,240 +10035,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) -{ - u32 flags = 0; - flags |= seg->g << 23; - flags |= seg->db << 22; - flags |= seg->l << 21; - flags |= seg->avl << 20; - flags |= seg->present << 15; - flags |= seg->dpl << 13; - flags |= seg->s << 12; - flags |= seg->type << 8; - return flags; -} - -static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - - kvm_get_segment(vcpu, &seg, n); - PUT_SMSTATE(u32, buf, 0x7fa8 + n * 4, seg.selector); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - PUT_SMSTATE(u32, buf, offset + 8, seg.base); - PUT_SMSTATE(u32, buf, offset + 4, seg.limit); - PUT_SMSTATE(u32, buf, offset, enter_smm_get_segment_flags(&seg)); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - u16 flags; - - kvm_get_segment(vcpu, &seg, n); - offset = 0x7e00 + n * 16; - - flags = enter_smm_get_segment_flags(&seg) >> 8; - PUT_SMSTATE(u16, buf, offset, seg.selector); - PUT_SMSTATE(u16, buf, offset + 2, flags); - PUT_SMSTATE(u32, buf, offset + 4, seg.limit); - PUT_SMSTATE(u64, buf, offset + 8, seg.base); -} -#endif - -static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - PUT_SMSTATE(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); - - for (i = 0; i < 8; i++) - PUT_SMSTATE(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); - - kvm_get_dr(vcpu, 6, &val); - PUT_SMSTATE(u32, buf, 0x7fcc, (u32)val); - kvm_get_dr(vcpu, 7, &val); - PUT_SMSTATE(u32, buf, 0x7fc8, (u32)val); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - PUT_SMSTATE(u32, buf, 0x7fc4, seg.selector); - PUT_SMSTATE(u32, buf, 0x7f64, seg.base); - PUT_SMSTATE(u32, buf, 0x7f60, seg.limit); - PUT_SMSTATE(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - PUT_SMSTATE(u32, buf, 0x7fc0, seg.selector); - PUT_SMSTATE(u32, buf, 0x7f80, seg.base); - PUT_SMSTATE(u32, buf, 0x7f7c, seg.limit); - PUT_SMSTATE(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7f74, dt.address); - PUT_SMSTATE(u32, buf, 0x7f70, dt.size); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7f58, dt.address); - PUT_SMSTATE(u32, buf, 0x7f54, dt.size); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_32(vcpu, buf, i); - - PUT_SMSTATE(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); - - /* revision id */ - PUT_SMSTATE(u32, buf, 0x7efc, 0x00020000); - PUT_SMSTATE(u32, buf, 0x7ef8, vcpu->arch.smbase); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - for (i = 0; i < 16; i++) - PUT_SMSTATE(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); - - PUT_SMSTATE(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - PUT_SMSTATE(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); - - kvm_get_dr(vcpu, 6, &val); - PUT_SMSTATE(u64, buf, 0x7f68, val); - kvm_get_dr(vcpu, 7, &val); - PUT_SMSTATE(u64, buf, 0x7f60, val); - - PUT_SMSTATE(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - PUT_SMSTATE(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - PUT_SMSTATE(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); - - PUT_SMSTATE(u32, buf, 0x7f00, vcpu->arch.smbase); - - /* revision id */ - PUT_SMSTATE(u32, buf, 0x7efc, 0x00020064); - - PUT_SMSTATE(u64, buf, 0x7ed0, vcpu->arch.efer); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - PUT_SMSTATE(u16, buf, 0x7e90, seg.selector); - PUT_SMSTATE(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - PUT_SMSTATE(u32, buf, 0x7e94, seg.limit); - PUT_SMSTATE(u64, buf, 0x7e98, seg.base); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7e84, dt.size); - PUT_SMSTATE(u64, buf, 0x7e88, dt.address); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - PUT_SMSTATE(u16, buf, 0x7e70, seg.selector); - PUT_SMSTATE(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - PUT_SMSTATE(u32, buf, 0x7e74, seg.limit); - PUT_SMSTATE(u64, buf, 0x7e78, seg.base); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7e64, dt.size); - PUT_SMSTATE(u64, buf, 0x7e68, dt.address); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_64(vcpu, buf, i); -} -#endif - -static void enter_smm(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ds; - struct desc_ptr dt; - unsigned long cr0; - char buf[512]; - - memset(buf, 0, 512); -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, buf); - else -#endif - enter_smm_save_state_32(vcpu, buf); - - /* - * Give enter_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. leave guest mode) after we've saved the state into the - * SMM state-save area. - */ - static_call(kvm_x86_enter_smm)(vcpu, buf); - - kvm_smm_changed(vcpu, true); - kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0x8000); - - cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; - - static_call(kvm_x86_set_cr4)(vcpu, 0); - - /* Undocumented: IDT limit is set to zero on entry to SMM. */ - dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); - - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - - cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; - cs.base = vcpu->arch.smbase; - - ds.selector = 0; - ds.base = 0; - - cs.limit = ds.limit = 0xffffffff; - cs.type = ds.type = 0x3; - cs.dpl = ds.dpl = 0; - cs.db = ds.db = 0; - cs.s = ds.s = 1; - cs.l = ds.l = 0; - cs.g = ds.g = 1; - cs.avl = ds.avl = 0; - cs.present = ds.present = 1; - cs.unusable = ds.unusable = 0; - cs.padding = ds.padding = 0; - - kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); - kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); - -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - static_call(kvm_x86_set_efer)(vcpu, 0); -#endif - - kvm_update_cpuid_runtime(vcpu); - kvm_mmu_reset_context(vcpu); -} - void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { -- GitLab From f1554150d3c694e30e92c681c20ce9714cac3d42 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 28 Oct 2022 06:01:26 -0400 Subject: [PATCH 0419/1423] KVM: x86: move SMM exit to a new file Some users of KVM implement the UEFI variable store through a paravirtual device that does not require the "SMM lockbox" component of edk2, and would like to compile out system management mode. In preparation for that, move the SMM exit code out of emulate.c and into a new file. The code is still written as a series of invocations of the emulator callbacks, but the two exiting_smm and leave_smm callbacks are merged into one, and all the code from em_rsm is now part of the callback. This removes all knowledge of the format of the SMM save state area from the emulator. Further patches will clean up the code and invoke KVM's own functions to access control registers, descriptor caches, etc. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-4-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 356 +------------------------------------ arch/x86/kvm/kvm_emulate.h | 34 +++- arch/x86/kvm/smm.c | 316 ++++++++++++++++++++++++++++++++ arch/x86/kvm/smm.h | 1 + arch/x86/kvm/x86.c | 14 -- 5 files changed, 351 insertions(+), 370 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index eea29aac83bd8..5cc3efa0e21c1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -30,7 +30,6 @@ #include "tss.h" #include "mmu.h" #include "pmu.h" -#include "smm.h" /* * Operand types @@ -243,37 +242,6 @@ enum x86_transfer_type { X86_TRANSFER_TASK_SWITCH, }; -static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - if (!(ctxt->regs_valid & (1 << nr))) { - ctxt->regs_valid |= 1 << nr; - ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); - } - return ctxt->_regs[nr]; -} - -static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - - ctxt->regs_valid |= 1 << nr; - ctxt->regs_dirty |= 1 << nr; - return &ctxt->_regs[nr]; -} - -static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - reg_read(ctxt, nr); - return reg_write(ctxt, nr); -} - static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; @@ -2339,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) -{ -#ifdef CONFIG_X86_64 - return ctxt->ops->guest_has_long_mode(ctxt); -#else - return false; -#endif -} - -static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) -{ - desc->g = (flags >> 23) & 1; - desc->d = (flags >> 22) & 1; - desc->l = (flags >> 21) & 1; - desc->avl = (flags >> 20) & 1; - desc->p = (flags >> 15) & 1; - desc->dpl = (flags >> 13) & 3; - desc->s = (flags >> 12) & 1; - desc->type = (flags >> 8) & 15; -} - -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - - selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); - return X86EMUL_CONTINUE; -} - -#ifdef CONFIG_X86_64 -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - u32 base3; - - offset = 0x7e00 + n * 16; - - selector = GET_SMSTATE(u16, smstate, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - base3 = GET_SMSTATE(u32, smstate, offset + 12); - - ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); - return X86EMUL_CONTINUE; -} -#endif - -static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr3, u64 cr4) -{ - int bad; - u64 pcid; - - /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ - pcid = 0; - if (cr4 & X86_CR4_PCIDE) { - pcid = cr3 & 0xfff; - cr3 &= ~0xfff; - } - - bad = ctxt->ops->set_cr(ctxt, 3, cr3); - if (bad) - return X86EMUL_UNHANDLEABLE; - - /* - * First enable PAE, long mode needs it before CR0.PG = 1 is set. - * Then enable protected mode. However, PCID cannot be enabled - * if EFER.LMA=0, so set it separately. - */ - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - if (bad) - return X86EMUL_UNHANDLEABLE; - - bad = ctxt->ops->set_cr(ctxt, 0, cr0); - if (bad) - return X86EMUL_UNHANDLEABLE; - - if (cr4 & X86_CR4_PCIDE) { - bad = ctxt->ops->set_cr(ctxt, 4, cr4); - if (bad) - return X86EMUL_UNHANDLEABLE; - if (pcid) { - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); - if (bad) - return X86EMUL_UNHANDLEABLE; - } - - } - - return X86EMUL_CONTINUE; -} - -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u16 selector; - u32 val, cr0, cr3, cr4; - int i; - - cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); - cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - - for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - - val = GET_SMSTATE(u32, smstate, 0x7fcc); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u32, smstate, 0x7fc8); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - - selector = GET_SMSTATE(u32, smstate, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f74); - dt.size = GET_SMSTATE(u32, smstate, 0x7f70); - ctxt->ops->set_gdt(ctxt, &dt); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f58); - dt.size = GET_SMSTATE(u32, smstate, 0x7f54); - ctxt->ops->set_idt(ctxt, &dt); - - for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); - - return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); -} - -#ifdef CONFIG_X86_64 -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u64 val, cr0, cr3, cr4; - u32 base3; - u16 selector; - int i, r; - - for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - - ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - - val = GET_SMSTATE(u64, smstate, 0x7f68); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u64, smstate, 0x7f60); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - cr0 = GET_SMSTATE(u64, smstate, 0x7f58); - cr3 = GET_SMSTATE(u64, smstate, 0x7f50); - cr4 = GET_SMSTATE(u64, smstate, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); - val = GET_SMSTATE(u64, smstate, 0x7ed0); - - if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); - base3 = GET_SMSTATE(u32, smstate, 0x7e9c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e84); - dt.address = GET_SMSTATE(u64, smstate, 0x7e88); - ctxt->ops->set_idt(ctxt, &dt); - - selector = GET_SMSTATE(u32, smstate, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); - base3 = GET_SMSTATE(u32, smstate, 0x7e7c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e64); - dt.address = GET_SMSTATE(u64, smstate, 0x7e68); - ctxt->ops->set_gdt(ctxt, &dt); - - r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); - if (r != X86EMUL_CONTINUE) - return r; - - for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - return X86EMUL_CONTINUE; -} -#endif - static int em_rsm(struct x86_emulate_ctxt *ctxt) { - unsigned long cr0, cr4, efer; - char buf[512]; - u64 smbase; - int ret; - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); - smbase = ctxt->ops->get_smbase(ctxt); - - ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); - if (ret != X86EMUL_CONTINUE) - return X86EMUL_UNHANDLEABLE; - - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); - - ctxt->ops->exiting_smm(ctxt); - - /* - * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU - * supports long mode. - */ - if (emulator_has_longmode(ctxt)) { - struct desc_struct cs_desc; - - /* Zero CR4.PCIDE before CR0.PG. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PCIDE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - - /* A 32-bit code segment is required to clear EFER.LMA. */ - memset(&cs_desc, 0, sizeof(cs_desc)); - cs_desc.type = 0xb; - cs_desc.s = cs_desc.g = cs_desc.p = 1; - ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); - } - - /* For the 64-bit case, this will clear EFER.LMA. */ - cr0 = ctxt->ops->get_cr(ctxt, 0); - if (cr0 & X86_CR0_PE) - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - - if (emulator_has_longmode(ctxt)) { - /* Clear CR4.PAE before clearing EFER.LME. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); - } - - /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. - */ - if (ctxt->ops->leave_smm(ctxt, buf)) - goto emulate_shutdown; - -#ifdef CONFIG_X86_64 - if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, buf); - else -#endif - ret = rsm_load_state_32(ctxt, buf); - - if (ret != X86EMUL_CONTINUE) - goto emulate_shutdown; + if (ctxt->ops->leave_smm(ctxt)) + ctxt->ops->triple_fault(ctxt); - /* - * Note, the ctxt->ops callbacks are responsible for handling side - * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID - * runtime updates, etc... If that changes, e.g. this flow is moved - * out of the emulator to make it look more like enter_smm(), then - * those side effects need to be explicitly handled for both success - * and shutdown. - */ return emulator_recalc_and_set_mode(ctxt); - -emulate_shutdown: - ctxt->ops->triple_fault(ctxt); - return X86EMUL_CONTINUE; } static void diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 89246446d6aa9..d7afbc448dd24 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -234,8 +234,7 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); - void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); - int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); + int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; @@ -526,4 +525,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); +static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} + +static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 26a6859e421fe..073dad4f04b59 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -270,3 +270,319 @@ void enter_smm(struct kvm_vcpu *vcpu) kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } + +static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) +{ +#ifdef CONFIG_X86_64 + return ctxt->ops->guest_has_long_mode(ctxt); +#else + return false; +#endif +} + +static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->d = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->p = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; +} + +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + + selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + return X86EMUL_CONTINUE; +} + +#ifdef CONFIG_X86_64 +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + u32 base3; + + offset = 0x7e00 + n * 16; + + selector = GET_SMSTATE(u16, smstate, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + base3 = GET_SMSTATE(u32, smstate, offset + 12); + + ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + return X86EMUL_CONTINUE; +} +#endif + +static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + u64 cr0, u64 cr3, u64 cr4) +{ + int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = ctxt->ops->set_cr(ctxt, 0, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const char *smstate) +{ + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; + u32 val, cr0, cr3, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); + cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); + + val = GET_SMSTATE(u32, smstate, 0x7fcc); + + if (ctxt->ops->set_dr(ctxt, 6, val)) + return X86EMUL_UNHANDLEABLE; + + val = GET_SMSTATE(u32, smstate, 0x7fc8); + + if (ctxt->ops->set_dr(ctxt, 7, val)) + return X86EMUL_UNHANDLEABLE; + + selector = GET_SMSTATE(u32, smstate, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + + selector = GET_SMSTATE(u32, smstate, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + + dt.address = GET_SMSTATE(u32, smstate, 0x7f74); + dt.size = GET_SMSTATE(u32, smstate, 0x7f70); + ctxt->ops->set_gdt(ctxt, &dt); + + dt.address = GET_SMSTATE(u32, smstate, 0x7f58); + dt.size = GET_SMSTATE(u32, smstate, 0x7f54); + ctxt->ops->set_idt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_32(ctxt, smstate, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + cr4 = GET_SMSTATE(u32, smstate, 0x7f14); + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); + + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); +} + +#ifdef CONFIG_X86_64 +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const char *smstate) +{ + struct desc_struct desc; + struct desc_ptr dt; + u64 val, cr0, cr3, cr4; + u32 base3; + u16 selector; + int i, r; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); + + ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; + + val = GET_SMSTATE(u64, smstate, 0x7f68); + + if (ctxt->ops->set_dr(ctxt, 6, val)) + return X86EMUL_UNHANDLEABLE; + + val = GET_SMSTATE(u64, smstate, 0x7f60); + + if (ctxt->ops->set_dr(ctxt, 7, val)) + return X86EMUL_UNHANDLEABLE; + + cr0 = GET_SMSTATE(u64, smstate, 0x7f58); + cr3 = GET_SMSTATE(u64, smstate, 0x7f50); + cr4 = GET_SMSTATE(u64, smstate, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); + val = GET_SMSTATE(u64, smstate, 0x7ed0); + + if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; + + selector = GET_SMSTATE(u32, smstate, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); + base3 = GET_SMSTATE(u32, smstate, 0x7e9c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + + dt.size = GET_SMSTATE(u32, smstate, 0x7e84); + dt.address = GET_SMSTATE(u64, smstate, 0x7e88); + ctxt->ops->set_idt(ctxt, &dt); + + selector = GET_SMSTATE(u32, smstate, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); + base3 = GET_SMSTATE(u32, smstate, 0x7e7c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + + dt.size = GET_SMSTATE(u32, smstate, 0x7e64); + dt.address = GET_SMSTATE(u64, smstate, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); + if (r != X86EMUL_CONTINUE) + return r; + + for (i = 0; i < 6; i++) { + r = rsm_load_seg_64(ctxt, smstate, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + return X86EMUL_CONTINUE; +} +#endif + +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + unsigned long cr0, cr4, efer; + char buf[512]; + u64 smbase; + int ret; + + smbase = ctxt->ops->get_smbase(ctxt); + + ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); + if (ret != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + kvm_smm_changed(vcpu, false); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. + * + * The ctxt->ops callbacks will handle all side effects when writing + * writing MSRs and CRs, e.g. MMU context resets, CPUID + * runtime updates, etc. + */ + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ + cr0 = ctxt->ops->get_cr(ctxt, 0); + if (cr0 & X86_CR0_PE) + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } + + /* + * Give leave_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. enter guest mode) before loading state from the SMM + * state-save area. + */ + if (static_call(kvm_x86_leave_smm)(vcpu, buf)) + return X86EMUL_UNHANDLEABLE; + +#ifdef CONFIG_X86_64 + if (emulator_has_longmode(ctxt)) + return rsm_load_state_64(ctxt, buf); + else +#endif + return rsm_load_state_32(ctxt, buf); +} diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index aacc6dac2c99a..b0602a92e511e 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -21,6 +21,7 @@ static inline bool is_smm(struct kvm_vcpu *vcpu) void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); void enter_smm(struct kvm_vcpu *vcpu); +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt); void process_smi(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0730b16564f99..b953f0184208f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8150,19 +8150,6 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - kvm_smm_changed(vcpu, false); -} - -static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); -} - static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); @@ -8226,7 +8213,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .exiting_smm = emulator_exiting_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, -- GitLab From 1d0da94cdafe38b2c501a8d55f981204e588e259 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:12 -0400 Subject: [PATCH 0420/1423] KVM: x86: do not go through ctxt->ops when emulating rsm Now that RSM is implemented in a single emulator callback, there is no point in going through other callbacks for the sake of modifying processor state. Just invoke KVM's own internal functions directly, and remove the callbacks that were only used by em_rsm; the only substantial difference is in the handling of the segment registers and descriptor cache, which have to be parsed into a struct kvm_segment instead of a struct desc_struct. This also fixes a bug where emulator_set_segment was shifting the limit left by 12 if the G bit is set, but the limit had not been shifted right upon entry to SMM. The emulator context is still used to restore EIP and the general purpose registers. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_emulate.h | 13 --- arch/x86/kvm/smm.c | 182 +++++++++++++++++-------------------- arch/x86/kvm/x86.c | 33 ------- 3 files changed, 85 insertions(+), 143 deletions(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index d7afbc448dd24..84b1f26614630 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -116,16 +116,6 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault, bool system); - /* - * read_phys: Read bytes of standard (non-emulated/special) memory. - * Used for descriptor reading. - * @addr: [IN ] Physical address from which to read. - * @val: [OUT] Value read from memory. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, - void *val, unsigned int bytes); - /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. @@ -209,11 +199,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); - u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); - void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); - int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 073dad4f04b59..102ecb8525640 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -271,71 +271,59 @@ void enter_smm(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) -{ -#ifdef CONFIG_X86_64 - return ctxt->ops->guest_has_long_mode(ctxt); -#else - return false; -#endif -} - -static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags) { desc->g = (flags >> 23) & 1; - desc->d = (flags >> 22) & 1; + desc->db = (flags >> 22) & 1; desc->l = (flags >> 21) & 1; desc->avl = (flags >> 20) & 1; - desc->p = (flags >> 15) & 1; + desc->present = (flags >> 15) & 1; desc->dpl = (flags >> 13) & 3; desc->s = (flags >> 12) & 1; desc->type = (flags >> 8) & 15; + + desc->unusable = !desc->present; + desc->padding = 0; } -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, +static int rsm_load_seg_32(struct kvm_vcpu *vcpu, const char *smstate, int n) { - struct desc_struct desc; + struct kvm_segment desc; int offset; - u16 selector; - - selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + desc.selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); + desc.base = GET_SMSTATE(u32, smstate, offset + 8); + desc.limit = GET_SMSTATE(u32, smstate, offset + 4); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + kvm_set_segment(vcpu, &desc, n); return X86EMUL_CONTINUE; } #ifdef CONFIG_X86_64 -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, +static int rsm_load_seg_64(struct kvm_vcpu *vcpu, const char *smstate, int n) { - struct desc_struct desc; + struct kvm_segment desc; int offset; - u16 selector; - u32 base3; offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smstate, offset); + desc.selector = GET_SMSTATE(u16, smstate, offset); rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - base3 = GET_SMSTATE(u32, smstate, offset + 12); - - ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + desc.limit = GET_SMSTATE(u32, smstate, offset + 4); + desc.base = GET_SMSTATE(u64, smstate, offset + 8); + kvm_set_segment(vcpu, &desc, n); return X86EMUL_CONTINUE; } #endif -static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, +static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu, u64 cr0, u64 cr3, u64 cr4) { int bad; @@ -348,7 +336,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, cr3 &= ~0xfff; } - bad = ctxt->ops->set_cr(ctxt, 3, cr3); + bad = kvm_set_cr3(vcpu, cr3); if (bad) return X86EMUL_UNHANDLEABLE; @@ -357,20 +345,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, * Then enable protected mode. However, PCID cannot be enabled * if EFER.LMA=0, so set it separately. */ - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); if (bad) return X86EMUL_UNHANDLEABLE; - bad = ctxt->ops->set_cr(ctxt, 0, cr0); + bad = kvm_set_cr0(vcpu, cr0); if (bad) return X86EMUL_UNHANDLEABLE; if (cr4 & X86_CR4_PCIDE) { - bad = ctxt->ops->set_cr(ctxt, 4, cr4); + bad = kvm_set_cr4(vcpu, cr4); if (bad) return X86EMUL_UNHANDLEABLE; if (pcid) { - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + bad = kvm_set_cr3(vcpu, cr3 | pcid); if (bad) return X86EMUL_UNHANDLEABLE; } @@ -383,9 +371,9 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, const char *smstate) { - struct desc_struct desc; + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_segment desc; struct desc_ptr dt; - u16 selector; u32 val, cr0, cr3, cr4; int i; @@ -399,56 +387,55 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, val = GET_SMSTATE(u32, smstate, 0x7fcc); - if (ctxt->ops->set_dr(ctxt, 6, val)) + if (kvm_set_dr(vcpu, 6, val)) return X86EMUL_UNHANDLEABLE; val = GET_SMSTATE(u32, smstate, 0x7fc8); - if (ctxt->ops->set_dr(ctxt, 7, val)) + if (kvm_set_dr(vcpu, 7, val)) return X86EMUL_UNHANDLEABLE; - selector = GET_SMSTATE(u32, smstate, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); + desc.selector = GET_SMSTATE(u32, smstate, 0x7fc4); + desc.base = GET_SMSTATE(u32, smstate, 0x7f64); + desc.limit = GET_SMSTATE(u32, smstate, 0x7f60); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + kvm_set_segment(vcpu, &desc, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smstate, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); + desc.selector = GET_SMSTATE(u32, smstate, 0x7fc0); + desc.base = GET_SMSTATE(u32, smstate, 0x7f80); + desc.limit = GET_SMSTATE(u32, smstate, 0x7f7c); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + kvm_set_segment(vcpu, &desc, VCPU_SREG_LDTR); dt.address = GET_SMSTATE(u32, smstate, 0x7f74); dt.size = GET_SMSTATE(u32, smstate, 0x7f70); - ctxt->ops->set_gdt(ctxt, &dt); + static_call(kvm_x86_set_gdt)(vcpu, &dt); dt.address = GET_SMSTATE(u32, smstate, 0x7f58); dt.size = GET_SMSTATE(u32, smstate, 0x7f54); - ctxt->ops->set_idt(ctxt, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smstate, i); + int r = rsm_load_seg_32(vcpu, smstate, i); if (r != X86EMUL_CONTINUE) return r; } cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); + vcpu->arch.smbase = GET_SMSTATE(u32, smstate, 0x7ef8); - return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); + return rsm_enter_protected_mode(vcpu, cr0, cr3, cr4); } #ifdef CONFIG_X86_64 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, const char *smstate) { - struct desc_struct desc; + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_segment desc; struct desc_ptr dt; u64 val, cr0, cr3, cr4; - u32 base3; - u16 selector; int i, r; for (i = 0; i < 16; i++) @@ -459,51 +446,49 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, val = GET_SMSTATE(u64, smstate, 0x7f68); - if (ctxt->ops->set_dr(ctxt, 6, val)) + if (kvm_set_dr(vcpu, 6, val)) return X86EMUL_UNHANDLEABLE; val = GET_SMSTATE(u64, smstate, 0x7f60); - if (ctxt->ops->set_dr(ctxt, 7, val)) + if (kvm_set_dr(vcpu, 7, val)) return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); cr3 = GET_SMSTATE(u64, smstate, 0x7f50); cr4 = GET_SMSTATE(u64, smstate, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); + vcpu->arch.smbase = GET_SMSTATE(u32, smstate, 0x7f00); val = GET_SMSTATE(u64, smstate, 0x7ed0); - if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) + if (kvm_set_msr(vcpu, MSR_EFER, val & ~EFER_LMA)) return X86EMUL_UNHANDLEABLE; - selector = GET_SMSTATE(u32, smstate, 0x7e90); + desc.selector = GET_SMSTATE(u32, smstate, 0x7e90); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); - base3 = GET_SMSTATE(u32, smstate, 0x7e9c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + desc.limit = GET_SMSTATE(u32, smstate, 0x7e94); + desc.base = GET_SMSTATE(u64, smstate, 0x7e98); + kvm_set_segment(vcpu, &desc, VCPU_SREG_TR); dt.size = GET_SMSTATE(u32, smstate, 0x7e84); dt.address = GET_SMSTATE(u64, smstate, 0x7e88); - ctxt->ops->set_idt(ctxt, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); - selector = GET_SMSTATE(u32, smstate, 0x7e70); + desc.selector = GET_SMSTATE(u32, smstate, 0x7e70); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); - base3 = GET_SMSTATE(u32, smstate, 0x7e7c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + desc.limit = GET_SMSTATE(u32, smstate, 0x7e74); + desc.base = GET_SMSTATE(u64, smstate, 0x7e78); + kvm_set_segment(vcpu, &desc, VCPU_SREG_LDTR); dt.size = GET_SMSTATE(u32, smstate, 0x7e64); dt.address = GET_SMSTATE(u64, smstate, 0x7e68); - ctxt->ops->set_gdt(ctxt, &dt); + static_call(kvm_x86_set_gdt)(vcpu, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); + r = rsm_enter_protected_mode(vcpu, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smstate, i); + r = rsm_load_seg_64(vcpu, smstate, i); if (r != X86EMUL_CONTINUE) return r; } @@ -515,19 +500,19 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = ctxt->vcpu; - unsigned long cr0, cr4, efer; + unsigned long cr0; char buf[512]; u64 smbase; int ret; - smbase = ctxt->ops->get_smbase(ctxt); + smbase = vcpu->arch.smbase; - ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); - if (ret != X86EMUL_CONTINUE) + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, buf, sizeof(buf)); + if (ret < 0) return X86EMUL_UNHANDLEABLE; - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); + if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0) + static_call(kvm_x86_set_nmi_mask)(vcpu, false); kvm_smm_changed(vcpu, false); @@ -535,41 +520,44 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) * Get back to real mode, to prepare a safe state in which to load * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. - * - * The ctxt->ops callbacks will handle all side effects when writing - * writing MSRs and CRs, e.g. MMU context resets, CPUID - * runtime updates, etc. */ - if (emulator_has_longmode(ctxt)) { - struct desc_struct cs_desc; +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + struct kvm_segment cs_desc; + unsigned long cr4; /* Zero CR4.PCIDE before CR0.PG. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); + cr4 = kvm_read_cr4(vcpu); if (cr4 & X86_CR4_PCIDE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); cs_desc.type = 0xb; - cs_desc.s = cs_desc.g = cs_desc.p = 1; - ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + cs_desc.s = cs_desc.g = cs_desc.present = 1; + kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS); } +#endif /* For the 64-bit case, this will clear EFER.LMA. */ - cr0 = ctxt->ops->get_cr(ctxt, 0); + cr0 = kvm_read_cr0(vcpu); if (cr0 & X86_CR0_PE) - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + unsigned long cr4, efer; - if (emulator_has_longmode(ctxt)) { /* Clear CR4.PAE before clearing EFER.LME. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); + cr4 = kvm_read_cr4(vcpu); if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE); /* And finally go back to 32-bit mode. */ efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + kvm_set_msr(vcpu, MSR_EFER, efer); } +#endif /* * Give leave_smm() a chance to make ISA-specific changes to the vCPU @@ -580,7 +568,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 - if (emulator_has_longmode(ctxt)) + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) return rsm_load_state_64(ctxt, buf); else #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b953f0184208f..019ba8725412c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7256,15 +7256,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); - - return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; -} - static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) @@ -8056,26 +8047,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) -{ - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); -} - -static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - return vcpu->arch.smbase; -} - -static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - vcpu->arch.smbase = smbase; -} - static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { @@ -8174,7 +8145,6 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, .write_std = emulator_write_std, - .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -8194,11 +8164,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .get_smbase = emulator_get_smbase, - .set_smbase = emulator_set_smbase, .set_msr_with_filter = emulator_set_msr_with_filter, .get_msr_with_filter = emulator_get_msr_with_filter, - .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, -- GitLab From 4b8e1b32013da2495244dbdee70f2456e6bc7aca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:13 -0400 Subject: [PATCH 0421/1423] KVM: allow compiling out SMM support Some users of KVM implement the UEFI variable store through a paravirtual device that does not require the "SMM lockbox" component of edk2; allow them to compile out system management mode, which is not a full implementation especially in how it interacts with nested virtualization. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-6-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 11 ++++++++++ arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/smm.h | 12 ++++++++++ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 22 +++++++++++++++++-- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 ++ 7 files changed, 50 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 67be7f217e37b..fbeaa9ddef598 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -118,6 +118,17 @@ config KVM_AMD_SEV Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. +config KVM_SMM + bool "System Management Mode emulation" + default y + depends on KVM + help + Provides support for KVM to emulate System Management Mode (SMM) + in virtual machines. This can be used by the virtual machine + firmware to implement UEFI secure boot. + + If unsure, say Y. + config KVM_XEN bool "Support for Xen hypercall interface" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b584cb0e06bd8..b8a494b6a5ec9 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,7 +20,7 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o -kvm-y += smm.o +kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index b0602a92e511e..0e1bd8bd6dc47 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -8,6 +8,7 @@ #define PUT_SMSTATE(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +#ifdef CONFIG_KVM_SMM static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_SMI, vcpu); @@ -23,5 +24,16 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); void enter_smm(struct kvm_vcpu *vcpu); int emulator_leave_smm(struct x86_emulate_ctxt *ctxt); void process_smi(struct kvm_vcpu *vcpu); +#else +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; } +static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; } +static inline void enter_smm(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(1); } +static inline void process_smi(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(1); } + +/* + * emulator_leave_smm is used as a function pointer, so the + * stub is defined in x86.c. + */ +#endif #endif diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3bb07ec789859..4cc014b464061 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4115,6 +4115,8 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* SEV-ES guests do not support SMM, so report false */ if (kvm && sev_es_guest(kvm)) return false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 49065614a3dbb..6a0b65815206b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6842,6 +6842,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* * We cannot do SMM unless we can run the guest in big * real mode. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 019ba8725412c..0a80cd1d91c8c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3642,7 +3642,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; @@ -4058,7 +4058,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; msr_info->data = vcpu->arch.smbase; break; @@ -4432,6 +4432,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; case KVM_CAP_X86_SMM: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + break; + /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -5182,6 +5185,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { +#ifdef CONFIG_KVM_SMM if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { kvm_x86_ops.nested_ops->leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); @@ -5196,6 +5200,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; } +#else + if (events->smi.smm || events->smi.pending || + events->smi.smm_inside_nmi) + return -EINVAL; +#endif + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -8121,6 +8131,14 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } +#ifndef CONFIG_KVM_SMM +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + WARN_ON_ONCE(1); + return X86EMUL_UNHANDLEABLE; +} +#endif + static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 1f136a81858e5..cb38a478e1f62 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -137,6 +137,8 @@ int main(int argc, char *argv[]) struct kvm_x86_state *state; int stage, stage_reported; + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM)); + /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); -- GitLab From 31e83e21cf00fe5b669eb352ff3ed70e74b40fad Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:14 -0400 Subject: [PATCH 0422/1423] KVM: x86: compile out vendor-specific code if SMM is disabled Vendor-specific code that deals with SMI injection and saving/restoring SMM state is not needed if CONFIG_KVM_SMM is disabled, so remove the four callbacks smi_allowed, enter_smm, leave_smm and enable_smi_window. The users in svm/nested.c and x86.c also have to be compiled out; the amount of #ifdef'ed code is small and it's not worth moving it to smm.c. enter_smm is now used only within #ifdef CONFIG_KVM_SMM, and the stub can therefore be removed. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-7-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 ++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/smm.h | 1 - arch/x86/kvm/svm/nested.c | 2 ++ arch/x86/kvm/svm/svm.c | 4 ++++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 4 ++++ 7 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 82ba4a564e587..ea58e67e9a670 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) +#ifdef CONFIG_KVM_SMM KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) +#endif KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 612ef60631c1f..3e5e54d7baa68 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1612,10 +1612,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); +#endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 0e1bd8bd6dc47..8debe81494c62 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -27,7 +27,6 @@ void process_smi(struct kvm_vcpu *vcpu); #else static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; } static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; } -static inline void enter_smm(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(1); } static inline void process_smi(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(1); } /* diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cc0fd75f7cbab..b258d6988f5dd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1378,6 +1378,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; @@ -1386,6 +1387,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } +#endif if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) { if (block_nested_events) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4cc014b464061..d28de3e59f7f7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4373,6 +4373,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +#ifdef CONFIG_KVM_SMM bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4522,6 +4523,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) /* We must be in SMM; RSM will cause a vmexit anyway. */ } } +#endif static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) @@ -4797,10 +4799,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = svm_smi_allowed, .enter_smm = svm_enter_smm, .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, +#endif .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6a0b65815206b..6be991b29bb76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7932,6 +7932,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } +#ifdef CONFIG_KVM_SMM static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ @@ -7986,6 +7987,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } +#endif static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { @@ -8153,10 +8155,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .setup_mce = vmx_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = vmx_smi_allowed, .enter_smm = vmx_enter_smm, .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, +#endif .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a80cd1d91c8c..9ac51c848fc82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9919,6 +9919,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, * in order to make progress and get back here for another iteration. * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) @@ -9931,6 +9932,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, } else static_call(kvm_x86_enable_smi_window)(vcpu); } +#endif if (vcpu->arch.nmi_pending) { r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; @@ -12580,10 +12582,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; +#ifdef CONFIG_KVM_SMM if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; +#endif if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || -- GitLab From ba97bb07e0b28c962015aaf219005928774b886c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:15 -0400 Subject: [PATCH 0423/1423] KVM: x86: remove SMRAM address space if SMM is not supported If CONFIG_KVM_SMM is not defined HF_SMM_MASK will always be zero, and we can spare userspace the hassle of setting up the SMRAM address space simply by reporting that only one address space is supported. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-8-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e5e54d7baa68..b9f6f854dcef8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1998,11 +1998,14 @@ enum { #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 - -#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) -#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#ifdef CONFIG_KVM_SMM +# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +# define KVM_ADDRESS_SPACE_NUM 2 +# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#else +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) +#endif #define KVM_ARCH_WANT_MMU_NOTIFIER -- GitLab From cf7316d0361c5d3289611402b5ac0c64c918b20b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 13:20:16 -0400 Subject: [PATCH 0424/1423] KVM: x86: do not define KVM_REQ_SMI if SMM disabled This ensures that all the relevant code is compiled out, in fact the process_smi stub can be removed too. Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky Message-Id: <20220929172016.319443-9-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/smm.h | 1 - arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b9f6f854dcef8..24a2152a77cd0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -81,7 +81,9 @@ #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) +#ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) +#endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 8debe81494c62..53c81394ebdbc 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -27,7 +27,6 @@ void process_smi(struct kvm_vcpu *vcpu); #else static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; } static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; } -static inline void process_smi(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(1); } /* * emulator_leave_smm is used as a function pointer, so the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ac51c848fc82..73c32030d5142 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5026,8 +5026,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif /* * KVM's ABI only allows for one exception to be migrated. Luckily, @@ -10266,8 +10268,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -12628,7 +12632,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; -- GitLab From 85672346a707d8e4d5657279dac6b356e1edf24a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Oct 2022 12:44:28 -0400 Subject: [PATCH 0425/1423] KVM: zero output of KVM_GET_VCPU_EVENTS before filling in the struct This allows making some fields optional, as will be the case soon for SMM-related data. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 73c32030d5142..84aa51613e4f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5057,16 +5057,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, ex->pending && ex->has_payload) kvm_deliver_exception_payload(vcpu, ex); + memset(events, 0, sizeof(*events)); + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(ex->vector)) { - events->exception.injected = 0; - events->exception.pending = 0; - } else { + if (!kvm_exception_is_soft(ex->vector)) { events->exception.injected = ex->injected; events->exception.pending = ex->pending; /* @@ -5086,15 +5085,13 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); - events->nmi.pad = 0; - events->sipi_vector = 0; /* never valid when reporting to user space */ + /* events->sipi_vector is never valid when reporting to user space */ events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; @@ -5111,8 +5108,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; } - - memset(&events->reserved, 0, sizeof(events->reserved)); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, -- GitLab From a7662aa5e56ffa9adf65699eda541f94e157cc83 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 7 Nov 2022 10:11:42 -0500 Subject: [PATCH 0426/1423] KVM: x86: do not define SMM-related constants if SMM disabled The hidden processor flags HF_SMM_MASK and HF_SMM_INSIDE_NMI_MASK are not needed if CONFIG_KVM_SMM is turned off. Remove the definitions altogether and the code that uses them. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/kvm_emulate.h | 1 - arch/x86/kvm/smm.c | 2 ++ arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 24a2152a77cd0..e9862eaac43b8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1997,10 +1997,11 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + +#ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#ifdef CONFIG_KVM_SMM # define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 84b1f26614630..2d9662be83337 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -278,7 +278,6 @@ enum x86emul_mode { /* These match some of the HF_* flags defined in kvm_host.h */ #define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ #define X86EMUL_SMM_MASK (1 << 6) -#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) /* * fastop functions are declared as taking a never-defined fastop parameter, diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 102ecb8525640..d9cf104ad94f5 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -10,6 +10,8 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); if (entering_smm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84aa51613e4f5..cbec2e675c18f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5093,10 +5093,12 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, /* events->sipi_vector is never valid when reporting to user space */ +#ifdef CONFIG_KVM_SMM events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; events->smi.smm_inside_nmi = !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); +#endif events->smi.latched_init = kvm_lapic_latched_init(vcpu); events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING @@ -8267,8 +8269,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->interruptibility = 0; ctxt->have_exception = false; -- GitLab From 89dccf82e99e95ee465e2b00428494fe64679256 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:33 +0300 Subject: [PATCH 0427/1423] KVM: x86: smm: check for failures on smm entry In the rare case of the failure on SMM entry, the KVM should at least terminate the VM instead of going south. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-16-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index d9cf104ad94f5..2d5bb2af70e40 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -213,11 +213,17 @@ void enter_smm(struct kvm_vcpu *vcpu) * Give enter_smm() a chance to make ISA-specific changes to the vCPU * state (e.g. leave guest mode) after we've saved the state into the * SMM state-save area. + * + * Kill the VM in the unlikely case of failure, because the VM + * can be in undefined state in this case. */ - static_call(kvm_x86_enter_smm)(vcpu, buf); + if (static_call(kvm_x86_enter_smm)(vcpu, buf)) + goto error; kvm_smm_changed(vcpu, true); - kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf))) + goto error; if (static_call(kvm_x86_get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; @@ -237,7 +243,8 @@ void enter_smm(struct kvm_vcpu *vcpu) dt.address = dt.size = 0; static_call(kvm_x86_set_idt)(vcpu, &dt); - kvm_set_dr(vcpu, 7, DR7_FIXED_1); + if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1))) + goto error; cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; cs.base = vcpu->arch.smbase; @@ -266,11 +273,15 @@ void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - static_call(kvm_x86_set_efer)(vcpu, 0); + if (static_call(kvm_x86_set_efer)(vcpu, 0)) + goto error; #endif kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); + return; +error: + kvm_vm_dead(vcpu->kvm); } static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags) -- GitLab From 09779c16e3eda95312ca14cd263dbb05da147b75 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:34 +0300 Subject: [PATCH 0428/1423] KVM: x86: smm: add structs for KVM's smram layout Add structs that will be used to define and read/write the KVM's SMRAM layout, instead of reading/writing to raw offsets. Also document the differences between KVM's SMRAM layout and SMRAM layout that is used by real Intel/AMD cpus. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-17-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.c | 98 +++++++++++++++++++++++++++++++++ arch/x86/kvm/smm.h | 133 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 2d5bb2af70e40..2e6ec79a581ef 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -8,6 +8,102 @@ #include "cpuid.h" #include "trace.h" +#define CHECK_SMRAM32_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00) + +#define CHECK_SMRAM64_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00) + +static void check_smram_offsets(void) +{ + /* 32 bit SMRAM image */ + CHECK_SMRAM32_OFFSET(reserved1, 0xFE00); + CHECK_SMRAM32_OFFSET(smbase, 0xFEF8); + CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC); + CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00); + CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02); + CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04); + CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08); + CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C); + CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10); + CHECK_SMRAM32_OFFSET(cr4, 0xFF14); + CHECK_SMRAM32_OFFSET(reserved3, 0xFF18); + CHECK_SMRAM32_OFFSET(ds, 0xFF2C); + CHECK_SMRAM32_OFFSET(fs, 0xFF38); + CHECK_SMRAM32_OFFSET(gs, 0xFF44); + CHECK_SMRAM32_OFFSET(idtr, 0xFF50); + CHECK_SMRAM32_OFFSET(tr, 0xFF5C); + CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C); + CHECK_SMRAM32_OFFSET(ldtr, 0xFF78); + CHECK_SMRAM32_OFFSET(es, 0xFF84); + CHECK_SMRAM32_OFFSET(cs, 0xFF90); + CHECK_SMRAM32_OFFSET(ss, 0xFF9C); + CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8); + CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC); + CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0); + CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4); + CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8); + CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC); + CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0); + CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4); + CHECK_SMRAM32_OFFSET(dr7, 0xFFC8); + CHECK_SMRAM32_OFFSET(dr6, 0xFFCC); + CHECK_SMRAM32_OFFSET(gprs, 0xFFD0); + CHECK_SMRAM32_OFFSET(eip, 0xFFF0); + CHECK_SMRAM32_OFFSET(eflags, 0xFFF4); + CHECK_SMRAM32_OFFSET(cr3, 0xFFF8); + CHECK_SMRAM32_OFFSET(cr0, 0xFFFC); + + /* 64 bit SMRAM image */ + CHECK_SMRAM64_OFFSET(es, 0xFE00); + CHECK_SMRAM64_OFFSET(cs, 0xFE10); + CHECK_SMRAM64_OFFSET(ss, 0xFE20); + CHECK_SMRAM64_OFFSET(ds, 0xFE30); + CHECK_SMRAM64_OFFSET(fs, 0xFE40); + CHECK_SMRAM64_OFFSET(gs, 0xFE50); + CHECK_SMRAM64_OFFSET(gdtr, 0xFE60); + CHECK_SMRAM64_OFFSET(ldtr, 0xFE70); + CHECK_SMRAM64_OFFSET(idtr, 0xFE80); + CHECK_SMRAM64_OFFSET(tr, 0xFE90); + CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0); + CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8); + CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0); + CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8); + CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0); + CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4); + CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8); + CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9); + CHECK_SMRAM64_OFFSET(reserved2, 0xFECA); + CHECK_SMRAM64_OFFSET(efer, 0xFED0); + CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8); + CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0); + CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8); + CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0); + CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC); + CHECK_SMRAM64_OFFSET(smbase, 0xFF00); + CHECK_SMRAM64_OFFSET(reserved4, 0xFF04); + CHECK_SMRAM64_OFFSET(ssp, 0xFF18); + CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20); + CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28); + CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30); + CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38); + CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40); + CHECK_SMRAM64_OFFSET(cr4, 0xFF48); + CHECK_SMRAM64_OFFSET(cr3, 0xFF50); + CHECK_SMRAM64_OFFSET(cr0, 0xFF58); + CHECK_SMRAM64_OFFSET(dr7, 0xFF60); + CHECK_SMRAM64_OFFSET(dr6, 0xFF68); + CHECK_SMRAM64_OFFSET(rflags, 0xFF70); + CHECK_SMRAM64_OFFSET(rip, 0xFF78); + CHECK_SMRAM64_OFFSET(gprs, 0xFF80); + + BUILD_BUG_ON(sizeof(union kvm_smram) != 512); +} + +#undef CHECK_SMRAM64_OFFSET +#undef CHECK_SMRAM32_OFFSET + + void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); @@ -201,6 +297,8 @@ void enter_smm(struct kvm_vcpu *vcpu) unsigned long cr0; char buf[512]; + check_smram_offsets(); + memset(buf, 0, 512); #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 53c81394ebdbc..b66da263ec822 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -2,6 +2,8 @@ #ifndef ASM_KVM_SMM_H #define ASM_KVM_SMM_H +#include + #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) @@ -9,6 +11,137 @@ *(type *)((buf) + (offset) - 0x7e00) = val #ifdef CONFIG_KVM_SMM + + +/* + * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout + * (https://www.sandpile.org/x86/smm.htm). + */ + +struct kvm_smm_seg_state_32 { + u32 flags; + u32 limit; + u32 base; +} __packed; + +struct kvm_smram_state_32 { + u32 reserved1[62]; + u32 smbase; + u32 smm_revision; + u16 io_inst_restart; + u16 auto_hlt_restart; + u32 io_restart_rdi; + u32 io_restart_rcx; + u32 io_restart_rsi; + u32 io_restart_rip; + u32 cr4; + + /* A20M#, CPL, shutdown and other reserved/undocumented fields */ + u32 reserved3[5]; + + struct kvm_smm_seg_state_32 ds; + struct kvm_smm_seg_state_32 fs; + struct kvm_smm_seg_state_32 gs; + struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */ + struct kvm_smm_seg_state_32 tr; + u32 reserved; + struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */ + struct kvm_smm_seg_state_32 ldtr; + struct kvm_smm_seg_state_32 es; + struct kvm_smm_seg_state_32 cs; + struct kvm_smm_seg_state_32 ss; + + u32 es_sel; + u32 cs_sel; + u32 ss_sel; + u32 ds_sel; + u32 fs_sel; + u32 gs_sel; + u32 ldtr_sel; + u32 tr_sel; + + u32 dr7; + u32 dr6; + u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */ + u32 eip; + u32 eflags; + u32 cr3; + u32 cr0; +} __packed; + + +/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */ + +struct kvm_smm_seg_state_64 { + u16 selector; + u16 attributes; + u32 limit; + u64 base; +}; + +struct kvm_smram_state_64 { + + struct kvm_smm_seg_state_64 es; + struct kvm_smm_seg_state_64 cs; + struct kvm_smm_seg_state_64 ss; + struct kvm_smm_seg_state_64 ds; + struct kvm_smm_seg_state_64 fs; + struct kvm_smm_seg_state_64 gs; + struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit*/ + struct kvm_smm_seg_state_64 ldtr; + struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit*/ + struct kvm_smm_seg_state_64 tr; + + /* I/O restart and auto halt restart are not implemented by KVM */ + u64 io_restart_rip; + u64 io_restart_rcx; + u64 io_restart_rsi; + u64 io_restart_rdi; + u32 io_restart_dword; + u32 reserved1; + u8 io_inst_restart; + u8 auto_hlt_restart; + u8 reserved2[6]; + + u64 efer; + + /* + * Two fields below are implemented on AMD only, to store + * SVM guest vmcb address if the #SMI was received while in the guest mode. + */ + u64 svm_guest_flag; + u64 svm_guest_vmcb_gpa; + u64 svm_guest_virtual_int; /* unknown purpose, not implemented */ + + u32 reserved3[3]; + u32 smm_revison; + u32 smbase; + u32 reserved4[5]; + + /* ssp and svm_* fields below are not implemented by KVM */ + u64 ssp; + u64 svm_guest_pat; + u64 svm_host_efer; + u64 svm_host_cr4; + u64 svm_host_cr3; + u64 svm_host_cr0; + + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX.) */ +}; + +union kvm_smram { + struct kvm_smram_state_64 smram64; + struct kvm_smram_state_32 smram32; + u8 bytes[512]; +}; + static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_SMI, vcpu); -- GitLab From 58c1d206d545464f9051ad080674b719d553215b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:35 +0300 Subject: [PATCH 0429/1423] KVM: x86: smm: use smram structs in the common code Use kvm_smram union instad of raw arrays in the common smm code. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-18-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/smm.c | 27 ++++++++++++++------------- arch/x86/kvm/svm/svm.c | 8 ++++++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e9862eaac43b8..4443869056323 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -206,6 +206,7 @@ typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; +union kvm_smram; enum x86_intercept; enum x86_intercept_stage; @@ -1616,8 +1617,8 @@ struct kvm_x86_ops { #ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); + int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 2e6ec79a581ef..ba2733e535fd7 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -295,17 +295,18 @@ void enter_smm(struct kvm_vcpu *vcpu) struct kvm_segment cs, ds; struct desc_ptr dt; unsigned long cr0; - char buf[512]; + union kvm_smram smram; check_smram_offsets(); - memset(buf, 0, 512); + memset(smram.bytes, 0, sizeof(smram.bytes)); + #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, buf); + enter_smm_save_state_64(vcpu, smram.bytes); else #endif - enter_smm_save_state_32(vcpu, buf); + enter_smm_save_state_32(vcpu, smram.bytes); /* * Give enter_smm() a chance to make ISA-specific changes to the vCPU @@ -315,12 +316,12 @@ void enter_smm(struct kvm_vcpu *vcpu) * Kill the VM in the unlikely case of failure, because the VM * can be in undefined state in this case. */ - if (static_call(kvm_x86_enter_smm)(vcpu, buf)) + if (static_call(kvm_x86_enter_smm)(vcpu, &smram)) goto error; kvm_smm_changed(vcpu, true); - if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf))) + if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram))) goto error; if (static_call(kvm_x86_get_nmi_mask)(vcpu)) @@ -480,7 +481,7 @@ static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu, } static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, - const char *smstate) + u8 *smstate) { struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_segment desc; @@ -541,7 +542,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, - const char *smstate) + u8 *smstate) { struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_segment desc; @@ -612,13 +613,13 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = ctxt->vcpu; unsigned long cr0; - char buf[512]; + union kvm_smram smram; u64 smbase; int ret; smbase = vcpu->arch.smbase; - ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, buf, sizeof(buf)); + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram)); if (ret < 0) return X86EMUL_UNHANDLEABLE; @@ -675,13 +676,13 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) * state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (static_call(kvm_x86_leave_smm)(vcpu, buf)) + if (static_call(kvm_x86_leave_smm)(vcpu, &smram)) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - return rsm_load_state_64(ctxt, buf); + return rsm_load_state_64(ctxt, smram.bytes); else #endif - return rsm_load_state_32(ctxt, buf); + return rsm_load_state_32(ctxt, smram.bytes); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d28de3e59f7f7..9d08214031d29 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4401,12 +4401,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map_save; int ret; + char *smstate = (char *)smram; + if (!is_guest_mode(vcpu)) return 0; @@ -4448,7 +4450,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; @@ -4456,6 +4458,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) struct vmcb *vmcb12; int ret; + const char *smstate = (const char *)smram; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6be991b29bb76..aca88524fd1ec 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7941,7 +7941,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7962,7 +7962,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; -- GitLab From f34bdf4c1707cdc687db87965d08bb5a51300c58 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:36 +0300 Subject: [PATCH 0430/1423] KVM: x86: smm: use smram struct for 32 bit smram load/restore Use kvm_smram_state_32 struct to save/restore 32 bit SMM state (used when X86_FEATURE_LM is not present in the guest CPUID). Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-19-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.c | 155 ++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 94 deletions(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index ba2733e535fd7..2c808d0a8e921 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -149,22 +149,17 @@ static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) return flags; } -static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_32 *state, + u32 *selector, int n) { struct kvm_segment seg; - int offset; kvm_get_segment(vcpu, &seg, n); - PUT_SMSTATE(u32, buf, 0x7fa8 + n * 4, seg.selector); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - PUT_SMSTATE(u32, buf, offset + 8, seg.base); - PUT_SMSTATE(u32, buf, offset + 4, seg.limit); - PUT_SMSTATE(u32, buf, offset, enter_smm_get_segment_flags(&seg)); + *selector = seg.selector; + state->base = seg.base; + state->limit = seg.limit; + state->flags = enter_smm_get_segment_flags(&seg); } #ifdef CONFIG_X86_64 @@ -185,54 +180,48 @@ static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) } #endif -static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, + struct kvm_smram_state_32 *smram) { struct desc_ptr dt; - struct kvm_segment seg; unsigned long val; int i; - PUT_SMSTATE(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - PUT_SMSTATE(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->eflags = kvm_get_rflags(vcpu); + smram->eip = kvm_rip_read(vcpu); for (i = 0; i < 8; i++) - PUT_SMSTATE(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); + smram->gprs[i] = kvm_register_read_raw(vcpu, i); kvm_get_dr(vcpu, 6, &val); - PUT_SMSTATE(u32, buf, 0x7fcc, (u32)val); + smram->dr6 = (u32)val; kvm_get_dr(vcpu, 7, &val); - PUT_SMSTATE(u32, buf, 0x7fc8, (u32)val); + smram->dr7 = (u32)val; - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - PUT_SMSTATE(u32, buf, 0x7fc4, seg.selector); - PUT_SMSTATE(u32, buf, 0x7f64, seg.base); - PUT_SMSTATE(u32, buf, 0x7f60, seg.limit); - PUT_SMSTATE(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - PUT_SMSTATE(u32, buf, 0x7fc0, seg.selector); - PUT_SMSTATE(u32, buf, 0x7f80, seg.base); - PUT_SMSTATE(u32, buf, 0x7f7c, seg.limit); - PUT_SMSTATE(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); + enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); + enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); static_call(kvm_x86_get_gdt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7f74, dt.address); - PUT_SMSTATE(u32, buf, 0x7f70, dt.size); + smram->gdtr.base = dt.address; + smram->gdtr.limit = dt.size; static_call(kvm_x86_get_idt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7f58, dt.address); - PUT_SMSTATE(u32, buf, 0x7f54, dt.size); + smram->idtr.base = dt.address; + smram->idtr.limit = dt.size; - for (i = 0; i < 6; i++) - enter_smm_save_seg_32(vcpu, buf, i); + enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES); + enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS); + enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS); - PUT_SMSTATE(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS); + enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS); + enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS); - /* revision id */ - PUT_SMSTATE(u32, buf, 0x7efc, 0x00020000); - PUT_SMSTATE(u32, buf, 0x7ef8, vcpu->arch.smbase); + smram->cr4 = kvm_read_cr4(vcpu); + smram->smm_revision = 0x00020000; + smram->smbase = vcpu->arch.smbase; } #ifdef CONFIG_X86_64 @@ -306,7 +295,7 @@ void enter_smm(struct kvm_vcpu *vcpu) enter_smm_save_state_64(vcpu, smram.bytes); else #endif - enter_smm_save_state_32(vcpu, smram.bytes); + enter_smm_save_state_32(vcpu, &smram.smram32); /* * Give enter_smm() a chance to make ISA-specific changes to the vCPU @@ -398,21 +387,16 @@ static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags) desc->padding = 0; } -static int rsm_load_seg_32(struct kvm_vcpu *vcpu, const char *smstate, - int n) +static int rsm_load_seg_32(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_32 *state, + u16 selector, int n) { struct kvm_segment desc; - int offset; - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - desc.selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); - desc.base = GET_SMSTATE(u32, smstate, offset + 8); - desc.limit = GET_SMSTATE(u32, smstate, offset + 4); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); + desc.selector = selector; + desc.base = state->base; + desc.limit = state->limit; + rsm_set_desc_flags(&desc, state->flags); kvm_set_segment(vcpu, &desc, n); return X86EMUL_CONTINUE; } @@ -481,63 +465,46 @@ static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu, } static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, - u8 *smstate) + const struct kvm_smram_state_32 *smstate) { struct kvm_vcpu *vcpu = ctxt->vcpu; - struct kvm_segment desc; struct desc_ptr dt; - u32 val, cr0, cr3, cr4; int i; - cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); - cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); + ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED; + ctxt->_eip = smstate->eip; for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - - val = GET_SMSTATE(u32, smstate, 0x7fcc); + *reg_write(ctxt, i) = smstate->gprs[i]; - if (kvm_set_dr(vcpu, 6, val)) + if (kvm_set_dr(vcpu, 6, smstate->dr6)) return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u32, smstate, 0x7fc8); - - if (kvm_set_dr(vcpu, 7, val)) + if (kvm_set_dr(vcpu, 7, smstate->dr7)) return X86EMUL_UNHANDLEABLE; - desc.selector = GET_SMSTATE(u32, smstate, 0x7fc4); - desc.base = GET_SMSTATE(u32, smstate, 0x7f64); - desc.limit = GET_SMSTATE(u32, smstate, 0x7f60); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); - kvm_set_segment(vcpu, &desc, VCPU_SREG_TR); - - desc.selector = GET_SMSTATE(u32, smstate, 0x7fc0); - desc.base = GET_SMSTATE(u32, smstate, 0x7f80); - desc.limit = GET_SMSTATE(u32, smstate, 0x7f7c); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); - kvm_set_segment(vcpu, &desc, VCPU_SREG_LDTR); + rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR); + rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smstate, 0x7f74); - dt.size = GET_SMSTATE(u32, smstate, 0x7f70); + dt.address = smstate->gdtr.base; + dt.size = smstate->gdtr.limit; static_call(kvm_x86_set_gdt)(vcpu, &dt); - dt.address = GET_SMSTATE(u32, smstate, 0x7f58); - dt.size = GET_SMSTATE(u32, smstate, 0x7f54); + dt.address = smstate->idtr.base; + dt.size = smstate->idtr.limit; static_call(kvm_x86_set_idt)(vcpu, &dt); - for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(vcpu, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } + rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES); + rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS); + rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS); - cr4 = GET_SMSTATE(u32, smstate, 0x7f14); + rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS); + rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS); + rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS); - vcpu->arch.smbase = GET_SMSTATE(u32, smstate, 0x7ef8); + vcpu->arch.smbase = smstate->smbase; - return rsm_enter_protected_mode(vcpu, cr0, cr3, cr4); + return rsm_enter_protected_mode(vcpu, smstate->cr0, + smstate->cr3, smstate->cr4); } #ifdef CONFIG_X86_64 @@ -684,5 +651,5 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) return rsm_load_state_64(ctxt, smram.bytes); else #endif - return rsm_load_state_32(ctxt, smram.bytes); + return rsm_load_state_32(ctxt, &smram.smram32); } -- GitLab From 8bcda1dee95ae88cade0ad671e0f4d371c005c4d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:37 +0300 Subject: [PATCH 0431/1423] KVM: x86: smm: use smram struct for 64 bit smram load/restore Use kvm_smram_state_64 struct to save/restore the 64 bit SMM state (used when X86_FEATURE_LM is present in the guest CPUID, regardless of 32-bitness of the guest). Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-20-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.c | 153 +++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 90 deletions(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 2c808d0a8e921..e3ecb1f841681 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -163,20 +163,17 @@ static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, } #ifdef CONFIG_X86_64 -static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_64 *state, + int n) { struct kvm_segment seg; - int offset; - u16 flags; kvm_get_segment(vcpu, &seg, n); - offset = 0x7e00 + n * 16; - - flags = enter_smm_get_segment_flags(&seg) >> 8; - PUT_SMSTATE(u16, buf, offset, seg.selector); - PUT_SMSTATE(u16, buf, offset + 2, flags); - PUT_SMSTATE(u32, buf, offset + 4, seg.limit); - PUT_SMSTATE(u64, buf, offset + 8, seg.base); + state->selector = seg.selector; + state->attributes = enter_smm_get_segment_flags(&seg) >> 8; + state->limit = seg.limit; + state->base = seg.base; } #endif @@ -225,57 +222,52 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, } #ifdef CONFIG_X86_64 -static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, + struct kvm_smram_state_64 *smram) { struct desc_ptr dt; - struct kvm_segment seg; unsigned long val; int i; for (i = 0; i < 16; i++) - PUT_SMSTATE(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); + smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i); + + smram->rip = kvm_rip_read(vcpu); + smram->rflags = kvm_get_rflags(vcpu); - PUT_SMSTATE(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - PUT_SMSTATE(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); kvm_get_dr(vcpu, 6, &val); - PUT_SMSTATE(u64, buf, 0x7f68, val); + smram->dr6 = val; kvm_get_dr(vcpu, 7, &val); - PUT_SMSTATE(u64, buf, 0x7f60, val); - - PUT_SMSTATE(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - PUT_SMSTATE(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - PUT_SMSTATE(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + smram->dr7 = val; - PUT_SMSTATE(u32, buf, 0x7f00, vcpu->arch.smbase); + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->cr4 = kvm_read_cr4(vcpu); - /* revision id */ - PUT_SMSTATE(u32, buf, 0x7efc, 0x00020064); + smram->smbase = vcpu->arch.smbase; + smram->smm_revison = 0x00020064; - PUT_SMSTATE(u64, buf, 0x7ed0, vcpu->arch.efer); + smram->efer = vcpu->arch.efer; - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - PUT_SMSTATE(u16, buf, 0x7e90, seg.selector); - PUT_SMSTATE(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - PUT_SMSTATE(u32, buf, 0x7e94, seg.limit); - PUT_SMSTATE(u64, buf, 0x7e98, seg.base); + enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR); static_call(kvm_x86_get_idt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7e84, dt.size); - PUT_SMSTATE(u64, buf, 0x7e88, dt.address); + smram->idtr.limit = dt.size; + smram->idtr.base = dt.address; - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - PUT_SMSTATE(u16, buf, 0x7e70, seg.selector); - PUT_SMSTATE(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - PUT_SMSTATE(u32, buf, 0x7e74, seg.limit); - PUT_SMSTATE(u64, buf, 0x7e78, seg.base); + enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR); static_call(kvm_x86_get_gdt)(vcpu, &dt); - PUT_SMSTATE(u32, buf, 0x7e64, dt.size); - PUT_SMSTATE(u64, buf, 0x7e68, dt.address); + smram->gdtr.limit = dt.size; + smram->gdtr.base = dt.address; - for (i = 0; i < 6; i++) - enter_smm_save_seg_64(vcpu, buf, i); + enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES); + enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS); + enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS); + enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS); + enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); + enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); } #endif @@ -292,7 +284,7 @@ void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, smram.bytes); + enter_smm_save_state_64(vcpu, &smram.smram64); else #endif enter_smm_save_state_32(vcpu, &smram.smram32); @@ -402,18 +394,17 @@ static int rsm_load_seg_32(struct kvm_vcpu *vcpu, } #ifdef CONFIG_X86_64 -static int rsm_load_seg_64(struct kvm_vcpu *vcpu, const char *smstate, + +static int rsm_load_seg_64(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_64 *state, int n) { struct kvm_segment desc; - int offset; - - offset = 0x7e00 + n * 16; - desc.selector = GET_SMSTATE(u16, smstate, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); - desc.limit = GET_SMSTATE(u32, smstate, offset + 4); - desc.base = GET_SMSTATE(u64, smstate, offset + 8); + desc.selector = state->selector; + rsm_set_desc_flags(&desc, state->attributes << 8); + desc.limit = state->limit; + desc.base = state->base; kvm_set_segment(vcpu, &desc, n); return X86EMUL_CONTINUE; } @@ -509,68 +500,50 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, - u8 *smstate) + const struct kvm_smram_state_64 *smstate) { struct kvm_vcpu *vcpu = ctxt->vcpu; - struct kvm_segment desc; struct desc_ptr dt; - u64 val, cr0, cr3, cr4; int i, r; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - - ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; + *reg_write(ctxt, i) = smstate->gprs[15 - i]; - val = GET_SMSTATE(u64, smstate, 0x7f68); + ctxt->_eip = smstate->rip; + ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED; - if (kvm_set_dr(vcpu, 6, val)) + if (kvm_set_dr(vcpu, 6, smstate->dr6)) return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u64, smstate, 0x7f60); - - if (kvm_set_dr(vcpu, 7, val)) + if (kvm_set_dr(vcpu, 7, smstate->dr7)) return X86EMUL_UNHANDLEABLE; - cr0 = GET_SMSTATE(u64, smstate, 0x7f58); - cr3 = GET_SMSTATE(u64, smstate, 0x7f50); - cr4 = GET_SMSTATE(u64, smstate, 0x7f48); - vcpu->arch.smbase = GET_SMSTATE(u32, smstate, 0x7f00); - val = GET_SMSTATE(u64, smstate, 0x7ed0); + vcpu->arch.smbase = smstate->smbase; - if (kvm_set_msr(vcpu, MSR_EFER, val & ~EFER_LMA)) + if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) return X86EMUL_UNHANDLEABLE; - desc.selector = GET_SMSTATE(u32, smstate, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); - desc.limit = GET_SMSTATE(u32, smstate, 0x7e94); - desc.base = GET_SMSTATE(u64, smstate, 0x7e98); - kvm_set_segment(vcpu, &desc, VCPU_SREG_TR); + rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smstate, 0x7e84); - dt.address = GET_SMSTATE(u64, smstate, 0x7e88); + dt.size = smstate->idtr.limit; + dt.address = smstate->idtr.base; static_call(kvm_x86_set_idt)(vcpu, &dt); - desc.selector = GET_SMSTATE(u32, smstate, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); - desc.limit = GET_SMSTATE(u32, smstate, 0x7e74); - desc.base = GET_SMSTATE(u64, smstate, 0x7e78); - kvm_set_segment(vcpu, &desc, VCPU_SREG_LDTR); + rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smstate, 0x7e64); - dt.address = GET_SMSTATE(u64, smstate, 0x7e68); + dt.size = smstate->gdtr.limit; + dt.address = smstate->gdtr.base; static_call(kvm_x86_set_gdt)(vcpu, &dt); - r = rsm_enter_protected_mode(vcpu, cr0, cr3, cr4); + r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); if (r != X86EMUL_CONTINUE) return r; - for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(vcpu, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } + rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES); + rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS); + rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS); + rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS); + rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); + rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); return X86EMUL_CONTINUE; } @@ -648,7 +621,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - return rsm_load_state_64(ctxt, smram.bytes); + return rsm_load_state_64(ctxt, &smram.smram64); else #endif return rsm_load_state_32(ctxt, &smram.smram32); -- GitLab From e6a82199b610d843d810bc90d1f5df667906c402 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:38 +0300 Subject: [PATCH 0432/1423] KVM: svm: drop explicit return value of kvm_vcpu_map if kvm_vcpu_map returns non zero value, error path should be triggered regardless of the exact returned error value. Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-21-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9d08214031d29..dfcdca3f538b5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4437,8 +4437,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) return 1; BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); @@ -4475,11 +4474,11 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) return 1; vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) return 1; ret = 1; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) goto unmap_map; if (svm_allocate_nested(svm)) -- GitLab From dd5045fed588b3e7ac0a4546138b2fe16d5d0fff Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:39 +0300 Subject: [PATCH 0433/1423] KVM: x86: SVM: use smram structs Use SMM structs in the SVM code as well, which removes the last user of put_smstate/GET_SMSTATE so remove these macros as well. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-22-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.h | 6 ------ arch/x86/kvm/svm/svm.c | 21 +++++++-------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index b66da263ec822..520217467ac20 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -4,12 +4,6 @@ #include -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - -#define PUT_SMSTATE(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - #ifdef CONFIG_KVM_SMM diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dfcdca3f538b5..2be8050bf981d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4407,15 +4407,11 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) struct kvm_host_map map_save; int ret; - char *smstate = (char *)smram; - if (!is_guest_mode(vcpu)) return 0; - /* FED8h - SVM Guest */ - PUT_SMSTATE(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - PUT_SMSTATE(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + smram->smram64.svm_guest_flag = 1; + smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4453,28 +4449,25 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - u64 saved_efer, vmcb12_gpa; struct vmcb *vmcb12; int ret; - const char *smstate = (const char *)smram; + const struct kvm_smram_state_64 *smram64 = &smram->smram64; if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ - if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + if (!smram64->svm_guest_flag) return 0; if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return 1; - saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - if (!(saved_efer & EFER_SVME)) + if (!(smram64->efer & EFER_SVME)) return 1; - vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) return 1; ret = 1; @@ -4500,7 +4493,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); if (ret) goto unmap_save; -- GitLab From 95504c7c981b3260b3b238ace03f3519bd9a0b6d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:40 +0300 Subject: [PATCH 0434/1423] KVM: x86: SVM: don't save SVM state to SMRAM when VM is not long mode capable When the guest CPUID doesn't have support for long mode, 32 bit SMRAM layout is used and it has no support for preserving EFER and/or SVM state. Note that this isn't relevant to running 32 bit guests on VM which is long mode capable - such VM can still run 32 bit guests in compatibility mode. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-23-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2be8050bf981d..527f18d8cc448 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4410,6 +4410,14 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) if (!is_guest_mode(vcpu)) return 0; + /* + * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is + * responsible for ensuring nested SVM and SMIs are mutually exclusive. + */ + + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 1; + smram->smram64.svm_guest_flag = 1; smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; -- GitLab From fb28875fd7da184079150295da7ee8d80a70917e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 25 Oct 2022 15:47:41 +0300 Subject: [PATCH 0435/1423] KVM: x86: smm: preserve interrupt shadow in SMRAM When #SMI is asserted, the CPU can be in interrupt shadow due to sti or mov ss. It is not mandatory in Intel/AMD prm to have the #SMI blocked during the shadow, and on top of that, since neither SVM nor VMX has true support for SMI window, waiting for one instruction would mean single stepping the guest. Instead, allow #SMI in this case, but both reset the interrupt window and stash its value in SMRAM to restore it on exit from SMM. This fixes rare failures seen mostly on windows guests on VMX, when #SMI falls on the sti instruction which mainfest in VM entry failure due to EFLAGS.IF not being set, but STI interrupt window still being set in the VMCS. Signed-off-by: Maxim Levitsky Message-Id: <20221025124741.228045-24-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/smm.c | 29 +++++++++++++++++++++++++---- arch/x86/kvm/smm.h | 8 ++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index e3ecb1f841681..a9c1c2af8d94c 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -27,7 +27,9 @@ static void check_smram_offsets(void) CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C); CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10); CHECK_SMRAM32_OFFSET(cr4, 0xFF14); - CHECK_SMRAM32_OFFSET(reserved3, 0xFF18); + CHECK_SMRAM32_OFFSET(reserved2, 0xFF18); + CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A); + CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B); CHECK_SMRAM32_OFFSET(ds, 0xFF2C); CHECK_SMRAM32_OFFSET(fs, 0xFF38); CHECK_SMRAM32_OFFSET(gs, 0xFF44); @@ -73,7 +75,9 @@ static void check_smram_offsets(void) CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4); CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8); CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9); - CHECK_SMRAM64_OFFSET(reserved2, 0xFECA); + CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA); + CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB); + CHECK_SMRAM64_OFFSET(reserved2, 0xFECC); CHECK_SMRAM64_OFFSET(efer, 0xFED0); CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8); CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0); @@ -219,6 +223,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, smram->cr4 = kvm_read_cr4(vcpu); smram->smm_revision = 0x00020000; smram->smbase = vcpu->arch.smbase; + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); } #ifdef CONFIG_X86_64 @@ -268,6 +274,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS); enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); } #endif @@ -313,6 +321,8 @@ void enter_smm(struct kvm_vcpu *vcpu) kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); static_call(kvm_x86_set_cr0)(vcpu, cr0); vcpu->arch.cr0 = cr0; @@ -460,7 +470,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = ctxt->vcpu; struct desc_ptr dt; - int i; + int i, r; ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED; ctxt->_eip = smstate->eip; @@ -494,8 +504,16 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, vcpu->arch.smbase = smstate->smbase; - return rsm_enter_protected_mode(vcpu, smstate->cr0, + r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); + + if (r != X86EMUL_CONTINUE) + return r; + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + + return r; } #ifdef CONFIG_X86_64 @@ -545,6 +563,9 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + return X86EMUL_CONTINUE; } #endif diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 520217467ac20..a1cf2ac5bd78d 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -31,7 +31,9 @@ struct kvm_smram_state_32 { u32 cr4; /* A20M#, CPL, shutdown and other reserved/undocumented fields */ - u32 reserved3[5]; + u16 reserved2; + u8 int_shadow; /* KVM extension */ + u8 reserved3[17]; struct kvm_smm_seg_state_32 ds; struct kvm_smm_seg_state_32 fs; @@ -95,7 +97,9 @@ struct kvm_smram_state_64 { u32 reserved1; u8 io_inst_restart; u8 auto_hlt_restart; - u8 reserved2[6]; + u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */ + u8 int_shadow; + u32 reserved2; u64 efer; -- GitLab From 93c5c61d9e58a9ea423439d358c198be5b674a58 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:06 -0400 Subject: [PATCH 0436/1423] mm/gup: Add FOLL_INTERRUPTIBLE We have had FAULT_FLAG_INTERRUPTIBLE but it was never applied to GUPs. One issue with it is that not all GUP paths are able to handle signal delivers besides SIGKILL. That's not ideal for the GUP users who are actually able to handle these cases, like KVM. KVM uses GUP extensively on faulting guest pages, during which we've got existing infrastructures to retry a page fault at a later time. Allowing the GUP to be interrupted by generic signals can make KVM related threads to be more responsive. For examples: (1) SIGUSR1: which QEMU/KVM uses to deliver an inter-process IPI, e.g. when the admin issues a vm_stop QMP command, SIGUSR1 can be generated to kick the vcpus out of kernel context immediately, (2) SIGINT: which can be used with interactive hypervisor users to stop a virtual machine with Ctrl-C without any delays/hangs, (3) SIGTRAP: which grants GDB capability even during page faults that are stuck for a long time. Normally hypervisor will be able to receive these signals properly, but not if we're stuck in a GUP for a long time for whatever reason. It happens easily with a stucked postcopy migration when e.g. a network temp failure happens, then some vcpu threads can hang death waiting for the pages. With the new FOLL_INTERRUPTIBLE, we can allow GUP users like KVM to selectively enable the ability to trap these signals. Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Signed-off-by: Peter Xu Message-Id: <20221011195809.557016-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/mm.h | 1 + mm/gup.c | 33 +++++++++++++++++++++++++++++---- mm/hugetlb.c | 5 ++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc55654..3c84f4e48cd77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2958,6 +2958,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_INTERRUPTIBLE 0x100000 /* allow interrupts from generic signals */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/mm/gup.c b/mm/gup.c index fe195d47de74a..90e372352e824 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -989,8 +989,17 @@ static int faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + /* + * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set + * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. + * That's because some callers may not be prepared to + * handle early exits caused by non-fatal signals. + */ + if (*flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { @@ -1391,6 +1400,22 @@ retry: } EXPORT_SYMBOL_GPL(fixup_user_fault); +/* + * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is + * specified, it'll also respond to generic signals. The caller of GUP + * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. + */ +static bool gup_signal_pending(unsigned int flags) +{ + if (fatal_signal_pending(current)) + return true; + + if (!(flags & FOLL_INTERRUPTIBLE)) + return false; + + return signal_pending(current); +} + /* * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT @@ -1472,11 +1497,11 @@ retry: * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted - * by fatal signals, so we need to check it before we + * by fatal signals of even common signals, depending on + * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ - - if (fatal_signal_pending(current)) { + if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 546df97c31e4c..b5ed54f760bb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6285,9 +6285,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; else if (unshare) fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; -- GitLab From fe5ed56c79733b7808f968567c581118ab79552e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:07 -0400 Subject: [PATCH 0437/1423] kvm: Add KVM_PFN_ERR_SIGPENDING Add a new pfn error to show that we've got a pending signal to handle during hva_to_pfn_slow() procedure (of -EINTR retval). Signed-off-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20221011195809.557016-3-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 ++++++++++ virt/kvm/kvm_main.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18592bdf4c1bf..911b064878dfa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -96,6 +96,7 @@ #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) +#define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) /* * error pfns indicate that the gfn is in slot but faild to @@ -106,6 +107,15 @@ static inline bool is_error_pfn(kvm_pfn_t pfn) return !!(pfn & KVM_PFN_ERR_MASK); } +/* + * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted + * by a pending signal. Note, the signal may or may not be fatal. + */ +static inline bool is_sigpending_pfn(kvm_pfn_t pfn) +{ + return pfn == KVM_PFN_ERR_SIGPENDING; +} + /* * error_noslot pfns indicate that the gfn can not be * translated to pfn - it is not in slot or failed to diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 25d7872b29c17..558f52dbebbde 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2667,6 +2667,8 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); if (npages == 1) return pfn; + if (npages == -EINTR) + return KVM_PFN_ERR_SIGPENDING; mmap_read_lock(current->mm); if (npages == -EHWPOISON || -- GitLab From c8b88b332bedf47a9aa008dfb69998c90623375c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:58:08 -0400 Subject: [PATCH 0438/1423] kvm: Add interruptible flag to __gfn_to_pfn_memslot() Add a new "interruptible" flag showing that the caller is willing to be interrupted by signals during the __gfn_to_pfn_memslot() request. Wire it up with a FOLL_INTERRUPTIBLE flag that we've just introduced. This prepares KVM to be able to respond to SIGUSR1 (for QEMU that's the SIGIPI) even during e.g. handling an userfaultfd page fault. No functional change intended. Signed-off-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20221011195809.557016-4-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 28 ++++++++++++++++---------- virt/kvm/kvm_mm.h | 4 ++-- virt/kvm/pfncache.c | 2 +- 8 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8c..f154d4a7fae0b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1239,7 +1239,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index e9744b41a226c..4939f57b6f6a2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -598,7 +598,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 5d5e12f3bf864..9d3743ca16d53 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -846,7 +846,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long pfn; /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f8c92a4a35faa..0bbfb33fa7355 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4170,7 +4170,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) @@ -4187,7 +4187,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 911b064878dfa..8fe4665bd020b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1150,8 +1150,8 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool *async, bool write_fault, - bool *writable, hva_t *hva); + bool atomic, bool interruptible, bool *async, + bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 558f52dbebbde..43bbe4fde078f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2514,7 +2514,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * 1 indicates success, -errno is returned if error is detected. */ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, - bool *writable, kvm_pfn_t *pfn) + bool interruptible, bool *writable, kvm_pfn_t *pfn) { unsigned int flags = FOLL_HWPOISON; struct page *page; @@ -2529,6 +2529,8 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, flags |= FOLL_WRITE; if (async) flags |= FOLL_NOWAIT; + if (interruptible) + flags |= FOLL_INTERRUPTIBLE; npages = get_user_pages_unlocked(addr, 1, &page, flags); if (npages != 1) @@ -2638,6 +2640,7 @@ out: * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest * @atomic: whether this function can sleep + * @interruptible: whether the process can be interrupted by non-fatal signals * @async: whether this function need to wait IO complete if the * host page is not in the memory * @write_fault: whether we should get a writable host page @@ -2648,8 +2651,8 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, + bool *async, bool write_fault, bool *writable) { struct vm_area_struct *vma; kvm_pfn_t pfn; @@ -2664,7 +2667,8 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (atomic) return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); + npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, + writable, &pfn); if (npages == 1) return pfn; if (npages == -EINTR) @@ -2699,8 +2703,8 @@ exit: } kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool *async, bool write_fault, - bool *writable, hva_t *hva) + bool atomic, bool interruptible, bool *async, + bool write_fault, bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -2725,7 +2729,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, atomic, async, write_fault, + return hva_to_pfn(addr, atomic, interruptible, async, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); @@ -2733,20 +2737,22 @@ EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable, NULL); + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, + NULL, write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, + NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, + NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 41da467d99c95..a1ab15006af34 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -24,8 +24,8 @@ #define KVM_MMU_READ_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable); +kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, + bool *async, bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 346e47f155724..bd4a46aee384b 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -185,7 +185,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) } /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL); + new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- GitLab From 766576874b9732ad6a65595296de351c064b4c0b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Oct 2022 15:59:47 -0400 Subject: [PATCH 0439/1423] kvm: x86: Allow to respond to generic signals during slow PF Enable x86 slow page faults to be able to respond to non-fatal signals, returning -EINTR properly when it happens. Signed-off-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20221011195947.557281-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0bbfb33fa7355..5d662b43a63ef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3149,8 +3149,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { + if (is_sigpending_pfn(pfn)) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can @@ -3172,7 +3177,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau { /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -4187,7 +4192,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, NULL, + /* + * Allow gup to bail on pending non-fatal signals when it's also allowed + * to wait for IO. Note, gup always bails if it is unable to quickly + * get a page and a fatal signal, i.e. SIGKILL, is pending. + */ + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; -- GitLab From be83794210e7020fef98596f4513aafbed659cd1 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:21 +0000 Subject: [PATCH 0440/1423] KVM: x86: Disallow the use of KVM_MSR_FILTER_DEFAULT_ALLOW in the kernel Protect the kernel from using the flag KVM_MSR_FILTER_DEFAULT_ALLOW. Its value is 0, and using it incorrectly could have unintended consequences. E.g. prevent someone in the kernel from writing something like this. if (filter.flags & KVM_MSR_FILTER_DEFAULT_ALLOW) and getting confused when it doesn't work. It would be more ideal to remove this flag altogether, but userspace may already be using it, so protecting the kernel is all that can reasonably be done at this point. Suggested-by: Sean Christopherson Signed-off-by: Aaron Lewis Reviewed-by: Sean Christopherson Message-Id: <20220921151525.904162-2-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 46de10a809ecb..73ad693aa653d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -222,7 +222,9 @@ struct kvm_msr_filter_range { #define KVM_MSR_FILTER_MAX_RANGES 16 struct kvm_msr_filter { +#ifndef __KERNEL__ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) __u32 flags; struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; -- GitLab From db205f7e1edc8d8f0880f0218d3a03b13fe94af3 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:22 +0000 Subject: [PATCH 0441/1423] KVM: x86: Add a VALID_MASK for the MSR exit reason flags Add the mask KVM_MSR_EXIT_REASON_VALID_MASK for the MSR exit reason flags. This simplifies checks that validate these flags, and makes it easier to introduce new flags in the future. No functional change intended. Signed-off-by: Aaron Lewis Message-Id: <20220921151525.904162-3-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +--- include/uapi/linux/kvm.h | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cbec2e675c18f..9ba8c1b73db49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6230,9 +6230,7 @@ split_irqchip_unlock: break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; - if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | - KVM_MSR_EXIT_REASON_UNKNOWN | - KVM_MSR_EXIT_REASON_FILTER)) + if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK) break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139ae..7fea12369245a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -485,6 +485,9 @@ struct kvm_run { #define KVM_MSR_EXIT_REASON_INVAL (1 << 0) #define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) #define KVM_MSR_EXIT_REASON_FILTER (1 << 2) +#define KVM_MSR_EXIT_REASON_VALID_MASK (KVM_MSR_EXIT_REASON_INVAL | \ + KVM_MSR_EXIT_REASON_UNKNOWN | \ + KVM_MSR_EXIT_REASON_FILTER) __u32 reason; /* kernel -> user */ __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ -- GitLab From c1340fe3590ebbe729294c728234434ef31a7c7a Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:23 +0000 Subject: [PATCH 0442/1423] KVM: x86: Add a VALID_MASK for the flag in kvm_msr_filter Add the mask KVM_MSR_FILTER_VALID_MASK for the flag in the struct kvm_msr_filter. This makes it easier to introduce new flags in the future. No functional change intended. Signed-off-by: Aaron Lewis Message-Id: <20220921151525.904162-4-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 73ad693aa653d..ae4324674c490 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -226,6 +226,7 @@ struct kvm_msr_filter { #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) #endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) +#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY) __u32 flags; struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ba8c1b73db49..5208b9501c880 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6441,7 +6441,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, int r = 0; u32 i; - if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) return -EINVAL; for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) -- GitLab From 8aff460f216753d86ab90ddbcab0125ab2400335 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:24 +0000 Subject: [PATCH 0443/1423] KVM: x86: Add a VALID_MASK for the flags in kvm_msr_filter_range Add the mask KVM_MSR_FILTER_RANGE_VALID_MASK for the flags in the struct kvm_msr_filter_range. This simplifies checks that validate these flags, and makes it easier to introduce new flags in the future. No functional change intended. Signed-off-by: Aaron Lewis Message-Id: <20220921151525.904162-5-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 2 ++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ae4324674c490..c6df6b16a0885 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -214,6 +214,8 @@ struct kvm_msr_list { struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) +#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \ + KVM_MSR_FILTER_WRITE) __u32 flags; __u32 nmsrs; /* number of msrs in bitmap */ __u32 base; /* MSR index the bitmap starts at */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5208b9501c880..e46e458c5b08a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6407,7 +6407,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, if (!user_range->nmsrs) return 0; - if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) return -EINVAL; if (!user_range->flags) -- GitLab From f7d64772712350fd35f1d76d16dec030a81029eb Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:25 +0000 Subject: [PATCH 0444/1423] selftests: kvm/x86: Test the flags in MSR filtering and MSR exiting When using the flags in KVM_X86_SET_MSR_FILTER and KVM_CAP_X86_USER_SPACE_MSR it is expected that an attempt to write to any of the unused bits will fail. Add testing to walk over every bit in each of the flag fields in MSR filtering and MSR exiting to verify that unused bits return and error and used bits, i.e. valid bits, succeed. Signed-off-by: Aaron Lewis Message-Id: <20220921151525.904162-6-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/userspace_msr_exit_test.c | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index a4f06370a2454..fae95089e6551 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -733,6 +733,89 @@ static void test_msr_permission_bitmap(void) kvm_vm_free(vm); } +#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \ +({ \ + int r = __vm_ioctl(vm, cmd, arg); \ + \ + if (flag & valid_mask) \ + TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \ + else \ + TEST_ASSERT(r == -1 && errno == EINVAL, \ + "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \ + #cmd, flag, r, errno, strerror(errno)); \ +}) + +static void run_user_space_msr_flag_test(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR }; + int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + + for (i = 0; i < nflags; i++) { + cap.args[0] = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap, + BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK); + } +} + +static void run_msr_filter_flag_test(struct kvm_vm *vm) +{ + u64 deny_bits = 0; + struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = 0, + .bitmap = (uint8_t *)&deny_bits, + }, + }, + }; + int nflags; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + nflags = sizeof(filter.flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK); + } + + filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW; + nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.ranges[0].flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK); + } +} + +/* Test that attempts to write to the unused bits in a flag fails. */ +static void test_user_exit_msr_flags(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */ + run_user_space_msr_flag_test(vm); + + /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. */ + run_msr_filter_flag_test(vm); + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { /* Tell stdout not to buffer its content */ @@ -744,5 +827,7 @@ int main(int argc, char *argv[]) test_msr_permission_bitmap(); + test_user_exit_msr_flags(); + return 0; } -- GitLab From 428e921611bcad9ab95078baf9abe14688de43f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:11 +0000 Subject: [PATCH 0445/1423] KVM: x86/mmu: Tag disallowed NX huge pages even if they're not tracked Tag shadow pages that cannot be replaced with an NX huge page regardless of whether or not zapping the page would allow KVM to immediately create a huge page, e.g. because something else prevents creating a huge page. I.e. track pages that are disallowed from being NX huge pages regardless of whether or not the page could have been huge at the time of fault. KVM currently tracks pages that were disallowed from being huge due to the NX workaround if and only if the page could otherwise be huge. But that fails to handled the scenario where whatever restriction prevented KVM from installing a huge page goes away, e.g. if dirty logging is disabled, the host mapping level changes, etc... Failure to tag shadow pages appropriately could theoretically lead to false negatives, e.g. if a fetch fault requests a small page and thus isn't tracked, and a read/write fault later requests a huge page, KVM will not reject the huge page as it should. To avoid yet another flag, initialize the list_head and use list_empty() to determine whether or not a page is on the list of NX huge pages that should be recovered. Note, the TDP MMU accounting is still flawed as fixing the TDP MMU is more involved due to mmu_lock being held for read. This will be addressed in a future commit. Fixes: 5bcaf3e1715f ("KVM: x86/mmu: Account NX huge page disallowed iff huge page was requested") Signed-off-by: Sean Christopherson Message-Id: <20221019165618.927057-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 32 ++++++++++++++++++++++++-------- arch/x86/kvm/mmu/mmu_internal.h | 10 +++++++++- arch/x86/kvm/mmu/paging_tmpl.h | 6 +++--- arch/x86/kvm/mmu/tdp_mmu.c | 4 +++- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5d662b43a63ef..989586e7dd86e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -803,15 +803,25 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) { - if (sp->lpage_disallowed) + sp->lpage_disallowed = true; + + /* + * If it's possible to replace the shadow page with an NX huge page, + * i.e. if the shadow page is the only thing currently preventing KVM + * from using a huge page, add the shadow page to the list of "to be + * zapped for NX recovery" pages. Note, the shadow page can already be + * on the list if KVM is reusing an existing shadow page, i.e. if KVM + * links a shadow page at multiple points. + */ + if (!nx_huge_page_possible || !list_empty(&sp->lpage_disallowed_link)) return; ++kvm->stat.nx_lpage_splits; list_add_tail(&sp->lpage_disallowed_link, &kvm->arch.lpage_disallowed_mmu_pages); - sp->lpage_disallowed = true; } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -833,9 +843,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; - list_del(&sp->lpage_disallowed_link); + + if (list_empty(&sp->lpage_disallowed_link)) + return; + + --kvm->stat.nx_lpage_splits; + list_del_init(&sp->lpage_disallowed_link); } static struct kvm_memory_slot * @@ -2130,6 +2144,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + INIT_LIST_HEAD(&sp->lpage_disallowed_link); + /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See @@ -3127,9 +3143,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->is_tdp && fault->huge_page_disallowed) + account_huge_nx_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 582def531d4d9..cca1ad75d0961 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -100,6 +100,13 @@ struct kvm_mmu_page { }; }; + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX + * huge page. A shadow page will have lpage_disallowed set but not be + * on the list if a huge page is disallowed for other reasons, e.g. + * because KVM is shadowing a PTE at the same gfn, the memslot isn't + * properly aligned, etc... + */ struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* @@ -315,7 +322,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible); void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5ab5f94dcb6fd..8fd0c4e1e5750 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_huge_nx_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 672f0432d7777..80a4a1a091310 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -284,6 +284,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { + INIT_LIST_HEAD(&sp->lpage_disallowed_link); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); sp->role = role; @@ -1141,7 +1143,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) - account_huge_nx_page(kvm, sp); + account_huge_nx_page(kvm, sp, true); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); -- GitLab From 55c510e26ab6181c132327a8b90c864e6193ce27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:12 +0000 Subject: [PATCH 0446/1423] KVM: x86/mmu: Rename NX huge pages fields/functions for consistency Rename most of the variables/functions involved in the NX huge page mitigation to provide consistency, e.g. lpage vs huge page, and NX huge vs huge NX, and also to provide clarity, e.g. to make it obvious the flag applies only to the NX huge page mitigation, not to any condition that prevents creating a huge page. Add a comment explaining what the newly named "possible_nx_huge_pages" tracks. Leave the nx_lpage_splits stat alone as the name is ABI and thus set in stone. Signed-off-by: Sean Christopherson Reviewed-by: Mingwei Zhang Message-Id: <20221019165618.927057-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 19 +++++++-- arch/x86/kvm/mmu/mmu.c | 71 +++++++++++++++++---------------- arch/x86/kvm/mmu/mmu_internal.h | 22 ++++++---- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++-- 5 files changed, 71 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4443869056323..9030e6263f954 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1159,7 +1159,18 @@ struct kvm_arch { struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; - struct list_head lpage_disallowed_mmu_pages; + /* + * A list of kvm_mmu_page structs that, if zapped, could possibly be + * replaced by an NX huge page. A shadow page is on this list if its + * existence disallows an NX huge page (nx_huge_page_disallowed is set) + * and there are no other conditions that prevent a huge page, e.g. + * the backing host page is huge, dirtly logging is not enabled for its + * memslot, etc... Note, zapping shadow pages on this list doesn't + * guarantee an NX huge page will be created in its stead, e.g. if the + * guest attempts to execute from the region then KVM obviously can't + * create an NX huge page (without hanging the guest). + */ + struct list_head possible_nx_huge_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; /* @@ -1275,7 +1286,7 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_lpage_recovery_thread; + struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 /* @@ -1320,8 +1331,8 @@ struct kvm_arch { * - tdp_mmu_roots (above) * - tdp_mmu_pages (above) * - the link field of kvm_mmu_page structs used by the TDP MMU - * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of kvm_mmu_page structs used + * - possible_nx_huge_pages; + * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * It is acceptable, but not necessary, to acquire this lock when * the thread holds the MMU lock in write mode. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 989586e7dd86e..09482ef4d8320 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -803,10 +803,10 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp, +void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible) { - sp->lpage_disallowed = true; + sp->nx_huge_page_disallowed = true; /* * If it's possible to replace the shadow page with an NX huge page, @@ -816,12 +816,13 @@ void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp, * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. */ - if (!nx_huge_page_possible || !list_empty(&sp->lpage_disallowed_link)) + if (!nx_huge_page_possible || + !list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->lpage_disallowed_link, - &kvm->arch.lpage_disallowed_mmu_pages); + list_add_tail(&sp->possible_nx_huge_page_link, + &kvm->arch.possible_nx_huge_pages); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -841,15 +842,15 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - sp->lpage_disallowed = false; + sp->nx_huge_page_disallowed = false; - if (list_empty(&sp->lpage_disallowed_link)) + if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; - list_del_init(&sp->lpage_disallowed_link); + list_del_init(&sp->possible_nx_huge_page_link); } static struct kvm_memory_slot * @@ -2144,7 +2145,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - INIT_LIST_HEAD(&sp->lpage_disallowed_link); + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() @@ -2503,8 +2504,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -3144,7 +3145,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed) - account_huge_nx_page(vcpu->kvm, sp, + account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } @@ -5998,7 +5999,7 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); r = kvm_mmu_init_tdp_mmu(kvm); @@ -6683,7 +6684,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -6815,7 +6816,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -6823,7 +6824,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_lpages(struct kvm *kvm) +static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; @@ -6846,23 +6847,25 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages - * because the number of lpage_disallowed pages is expected to - * be relatively small compared to the total. + * because the number of shadow pages that be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - lpage_disallowed_link); - WARN_ON_ONCE(!sp->lpage_disallowed); + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); if (is_tdp_mmu_page(sp)) { flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + WARN_ON_ONCE(sp->nx_huge_page_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { @@ -6883,7 +6886,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_lpage_recovery_timeout(u64 start_time) +static long get_nx_huge_page_recovery_timeout(u64 start_time) { bool enabled; uint period; @@ -6894,19 +6897,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time) : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) { u64 start_time; long remaining_time; while (true) { start_time = get_jiffies_64(); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && remaining_time > 0) { schedule_timeout(remaining_time); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); } @@ -6915,7 +6918,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) if (kthread_should_stop()) return 0; - kvm_recover_nx_lpages(kvm); + kvm_recover_nx_huge_pages(kvm); } } @@ -6923,17 +6926,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", - &kvm->arch.nx_lpage_recovery_thread); + &kvm->arch.nx_huge_page_recovery_thread); if (!err) - kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); return err; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { - if (kvm->arch.nx_lpage_recovery_thread) - kthread_stop(kvm->arch.nx_lpage_recovery_thread); + if (kvm->arch.nx_huge_page_recovery_thread) + kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index cca1ad75d0961..67879459a25c8 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -57,7 +57,13 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The shadow page can't be replaced by an equivalent huge page + * because it is being used to map an executable page in the guest + * and the NX huge page mitigation is enabled. + */ + bool nx_huge_page_disallowed; /* * The following two entries are used to key the shadow page in the @@ -102,12 +108,12 @@ struct kvm_mmu_page { /* * Tracks shadow pages that, if zapped, would allow KVM to create an NX - * huge page. A shadow page will have lpage_disallowed set but not be - * on the list if a huge page is disallowed for other reasons, e.g. - * because KVM is shadowing a PTE at the same gfn, the memslot isn't - * properly aligned, etc... + * huge page. A shadow page will have nx_huge_page_disallowed set but + * not be on the list if a huge page is disallowed for other reasons, + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot + * isn't properly aligned, etc... */ - struct list_head lpage_disallowed_link; + struct list_head possible_nx_huge_page_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -322,8 +328,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp, +void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible); -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 8fd0c4e1e5750..0f64550720557 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -714,7 +714,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed) - account_huge_nx_page(vcpu->kvm, sp, + account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 80a4a1a091310..73eb28ed1f03f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -284,7 +284,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { - INIT_LIST_HEAD(&sp->lpage_disallowed_link); + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -403,8 +403,8 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, lockdep_assert_held_write(&kvm->mmu_lock); list_del(&sp->link); - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); if (shared) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); @@ -1143,7 +1143,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) - account_huge_nx_page(kvm, sp, true); + account_nx_huge_page(kvm, sp, true); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); -- GitLab From b5b0977f4aa28ef2166894b05f37d8f8028a76ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:13 +0000 Subject: [PATCH 0447/1423] KVM: x86/mmu: Properly account NX huge page workaround for nonpaging MMUs Account and track NX huge pages for nonpaging MMUs so that a future enhancement to precisely check if a shadow page can't be replaced by a NX huge page doesn't get false positives. Without correct tracking, KVM can get stuck in a loop if an instruction is fetching and writing data on the same huge page, e.g. KVM installs a small executable page on the fetch fault, replaces it with an NX huge page on the write fault, and faults again on the fetch. Alternatively, and perhaps ideally, KVM would simply not enforce the workaround for nonpaging MMUs. The guest has no page tables to abuse and KVM is guaranteed to switch to a different MMU on CR0.PG being toggled so there's no security or performance concerns. However, getting make_spte() to play nice now and in the future is unnecessarily complex. In the current code base, make_spte() can enforce the mitigation if TDP is enabled or the MMU is indirect, but make_spte() may not always have a vCPU/MMU to work with, e.g. if KVM were to support in-line huge page promotion when disabling dirty logging. Without a vCPU/MMU, KVM could either pass in the correct information and/or derive it from the shadow page, but the former is ugly and the latter subtly non-trivial due to the possibility of direct shadow pages in indirect MMUs. Given that using shadow paging with an unpaged guest is far from top priority _and_ has been subjected to the workaround since its inception, keep it simple and just fix the accounting glitch. Signed-off-by: Sean Christopherson Reviewed-by: David Matlack Reviewed-by: Mingwei Zhang Message-Id: <20221019165618.927057-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/spte.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 09482ef4d8320..f11e4bbfc0bcc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3144,7 +3144,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed) + if (fault->huge_page_disallowed) account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 2e08b2a453612..c0fd7e049b4e4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!prefetch) spte |= spte_shadow_accessed_mask(spte); + /* + * For simplicity, enforce the NX huge page mitigation even if not + * strictly necessary. KVM could ignore the mitigation if paging is + * disabled in the guest, as the guest doesn't have an page tables to + * abuse. But to safely ignore the mitigation, KVM would have to + * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG + * is toggled on, and that's a net negative for performance when TDP is + * enabled. When TDP is disabled, KVM will always switch to a new MMU + * when CR0.PG is toggled, but leveraging that to ignore the mitigation + * would tie make_spte() further to vCPU/MMU state, and add complexity + * just to optimize a mode that is anything but performance critical. + */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; -- GitLab From 61f94478547bb4fdcd4c4f37a0aa723d610a7422 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:14 +0000 Subject: [PATCH 0448/1423] KVM: x86/mmu: Set disallowed_nx_huge_page in TDP MMU before setting SPTE Set nx_huge_page_disallowed in TDP MMU shadow pages before making the SP visible to other readers, i.e. before setting its SPTE. This will allow KVM to query the flag when determining if a shadow page can be replaced by a NX huge page without violating the rules of the mitigation. Note, the shadow/legacy MMU holds mmu_lock for write, so it's impossible for another CPU to see a shadow page without an up-to-date nx_huge_page_disallowed, i.e. only the TDP MMU needs the complicated dance. Signed-off-by: Sean Christopherson Reviewed-by: David Matlack Reviewed-by: Yan Zhao Message-Id: <20221019165618.927057-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++--------- arch/x86/kvm/mmu/mmu_internal.h | 5 ++--- arch/x86/kvm/mmu/tdp_mmu.c | 31 ++++++++++++++++++------------- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f11e4bbfc0bcc..e384b78e099c1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -803,11 +803,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool nx_huge_page_possible) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - sp->nx_huge_page_disallowed = true; - /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM @@ -816,8 +813,7 @@ void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. */ - if (!nx_huge_page_possible || - !list_empty(&sp->possible_nx_huge_page_link)) + if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; @@ -825,6 +821,15 @@ void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, &kvm->arch.possible_nx_huge_pages); } +static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) +{ + sp->nx_huge_page_disallowed = true; + + if (nx_huge_page_possible) + track_possible_nx_huge_page(kvm, sp); +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -842,10 +847,8 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - sp->nx_huge_page_disallowed = false; - if (list_empty(&sp->possible_nx_huge_page_link)) return; @@ -853,6 +856,13 @@ void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del_init(&sp->possible_nx_huge_page_link); } +static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + sp->nx_huge_page_disallowed = false; + + untrack_possible_nx_huge_page(kvm, sp); +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 67879459a25c8..22152241bd290 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -328,8 +328,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); -void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool nx_huge_page_possible); -void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 73eb28ed1f03f..059231c823452 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -403,8 +403,11 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, lockdep_assert_held_write(&kvm->mmu_lock); list_del(&sp->link); - if (sp->nx_huge_page_disallowed) - unaccount_nx_huge_page(kvm, sp); + + if (sp->nx_huge_page_disallowed) { + sp->nx_huge_page_disallowed = false; + untrack_possible_nx_huge_page(kvm, sp); + } if (shared) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); @@ -1118,16 +1121,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @sp: The new TDP page table to install. - * @account_nx: True if this page table is being installed to split a - * non-executable huge page. * @shared: This operation is running under the MMU lock in read mode. * * Returns: 0 if the new page table was installed. Non-0 if the page table * could not be installed (e.g. the atomic compare-exchange failed). */ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_mmu_page *sp, bool account_nx, - bool shared) + struct kvm_mmu_page *sp, bool shared) { u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); int ret = 0; @@ -1142,8 +1142,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_nx_huge_page(kvm, sp, true); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); @@ -1157,6 +1155,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; int ret; @@ -1193,9 +1192,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (!is_shadow_present_pte(iter.old_spte)) { - bool account_nx = fault->huge_page_disallowed && - fault->req_level >= iter.level; - /* * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table @@ -1207,10 +1203,19 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); - if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { + sp->nx_huge_page_disallowed = fault->huge_page_disallowed; + + if (tdp_mmu_link_sp(kvm, &iter, sp, true)) { tdp_mmu_free_sp(sp); break; } + + if (fault->huge_page_disallowed && + fault->req_level >= iter.level) { + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + track_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + } } } @@ -1498,7 +1503,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * correctness standpoint since the translation will be the same either * way. */ - ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + ret = tdp_mmu_link_sp(kvm, iter, sp, shared); if (ret) goto out; -- GitLab From d25ceb9264364dca0683748b301340097fdab6c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:15 +0000 Subject: [PATCH 0449/1423] KVM: x86/mmu: Track the number of TDP MMU pages, but not the actual pages Track the number of TDP MMU "shadow" pages instead of tracking the pages themselves. With the NX huge page list manipulation moved out of the common linking flow, elminating the list-based tracking means the happy path of adding a shadow page doesn't need to acquire a spinlock and can instead inc/dec an atomic. Keep the tracking as the WARN during TDP MMU teardown on leaked shadow pages is very, very useful for detecting KVM bugs. Tracking the number of pages will also make it trivial to expose the counter to userspace as a stat in the future, which may or may not be desirable. Note, the TDP MMU needs to use a separate counter (and stat if that ever comes to be) from the existing n_used_mmu_pages. The TDP MMU doesn't bother supporting the shrinker nor does it honor KVM_SET_NR_MMU_PAGES (because the TDP MMU consumes so few pages relative to shadow paging), and including TDP MMU pages in that counter would break both the shrinker and shadow MMUs, e.g. if a VM is using nested TDP. Cc: Yan Zhao Reviewed-by: Mingwei Zhang Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Reviewed-by: Yan Zhao Message-Id: <20221019165618.927057-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 +++-------- arch/x86/kvm/mmu/tdp_mmu.c | 20 +++++++++----------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9030e6263f954..9362b9736d877 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1298,6 +1298,9 @@ struct kvm_arch { */ bool tdp_mmu_enabled; + /* The number of TDP MMU pages across all roots. */ + atomic64_t tdp_mmu_pages; + /* * List of kvm_mmu_page structs being used as roots. * All kvm_mmu_page structs in the list should have @@ -1318,18 +1321,10 @@ struct kvm_arch { */ struct list_head tdp_mmu_roots; - /* - * List of kvm_mmu_page structs not being used as roots. - * All kvm_mmu_page structs in the list should have - * tdp_mmu_page set and a tdp_mmu_root_count of 0. - */ - struct list_head tdp_mmu_pages; - /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) - * - tdp_mmu_pages (above) * - the link field of kvm_mmu_page structs used by the TDP MMU * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 059231c823452..4e5b3ae824c16 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); kvm->arch.tdp_mmu_zap_wq = wq; return 1; } @@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); - WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); + WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -377,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); + atomic64_inc(&kvm->arch.tdp_mmu_pages); } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); + atomic64_dec(&kvm->arch.tdp_mmu_pages); } /** @@ -397,17 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, bool shared) { tdp_unaccount_mmu_page(kvm, sp); + + if (!sp->nx_huge_page_disallowed) + return; + if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); else lockdep_assert_held_write(&kvm->mmu_lock); - list_del(&sp->link); - - if (sp->nx_huge_page_disallowed) { - sp->nx_huge_page_disallowed = false; - untrack_possible_nx_huge_page(kvm, sp); - } + sp->nx_huge_page_disallowed = false; + untrack_possible_nx_huge_page(kvm, sp); if (shared) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); @@ -1140,9 +1141,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, tdp_mmu_set_spte(kvm, iter, spte); } - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); return 0; -- GitLab From 5e3edd7e8b7e8004c9bb8310fd669a9ca81de207 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:16 +0000 Subject: [PATCH 0450/1423] KVM: x86/mmu: Add helper to convert SPTE value to its shadow page Add a helper to convert a SPTE to its shadow page to deduplicate a variety of flows and hopefully avoid future bugs, e.g. if KVM attempts to get the shadow page for a SPTE without dropping high bits. Opportunistically add a comment in mmu_free_root_page() documenting why it treats the root HPA as a SPTE. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221019165618.927057-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++++------- arch/x86/kvm/mmu/mmu_internal.h | 12 ------------ arch/x86/kvm/mmu/spte.h | 17 +++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 ++ 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e384b78e099c1..5f9b57c6e5068 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1819,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2378,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; @@ -2399,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, spte); /* @@ -2838,7 +2838,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3455,7 +3455,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + sp = spte_to_child_sp(*root_hpa); if (WARN_ON(!sp)) return; @@ -3940,8 +3944,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= SPTE_BASE_ADDR_MASK; - sp = to_shadow_page(root); + sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 22152241bd290..dbaf6755c5a71 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -133,18 +133,6 @@ struct kvm_mmu_page { extern struct kmem_cache *mmu_page_header_cache; -static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - -static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) -{ - return to_shadow_page(__pa(sptep)); -} - static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) { return role.smm ? 1 : 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 79560d77aa4c7..1f03701b943a1 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) +{ + return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index c163f7cc23ca5..d3714200b932a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,6 +5,8 @@ #include +#include "spte.h" + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) -- GitLab From 76901e56fb517db61939efdf54e9581b117d615d Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Wed, 19 Oct 2022 16:56:17 +0000 Subject: [PATCH 0451/1423] KVM: x86/mmu: explicitly check nx_hugepage in disallowed_hugepage_adjust() Explicitly check if a NX huge page is disallowed when determining if a page fault needs to be forced to use a smaller sized page. KVM currently assumes that the NX huge page mitigation is the only scenario where KVM will force a shadow page instead of a huge page, and so unnecessarily keeps an existing shadow page instead of replacing it with a huge page. Any scenario that causes KVM to zap leaf SPTEs may result in having a SP that can be made huge without violating the NX huge page mitigation. E.g. prior to commit 5ba7c4c6d1c7 ("KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging"), KVM would keep shadow pages after disabling dirty logging due to a live migration being canceled, resulting in degraded performance due to running with 4kb pages instead of huge pages. Although the dirty logging case is "fixed", that fix is coincidental, i.e. is an implementation detail, and there are other scenarios where KVM will zap leaf SPTEs. E.g. zapping leaf SPTEs in response to a host page migration (mmu_notifier invalidation) to create a huge page would yield a similar result; KVM would see the shadow-present non-leaf SPTE and assume a huge page is disallowed. Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Reviewed-by: Ben Gardon Reviewed-by: David Matlack Signed-off-by: Mingwei Zhang [sean: use spte_to_child_sp(), massage changelog, fold into if-statement] Signed-off-by: Sean Christopherson Reviewed-by: Yan Zhao Message-Id: <20221019165618.927057-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5f9b57c6e5068..efce5e4e24c30 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3112,7 +3112,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && - !is_large_pte(spte)) { + !is_large_pte(spte) && + spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch) * and __direct_map would like to create a large PTE -- GitLab From 3a05675722250a522c148f6de0cc190f407c4bb5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 19 Oct 2022 16:56:18 +0000 Subject: [PATCH 0452/1423] KVM: x86/mmu: WARN if TDP MMU SP disallows hugepage after being zapped Extend the accounting sanity check in kvm_recover_nx_huge_pages() to the TDP MMU, i.e. verify that zapping a shadow page unaccounts the disallowed NX huge page regardless of the MMU type. Recovery runs while holding mmu_lock for write and so it should be impossible to get false positives on the WARN. Suggested-by: Yan Zhao Signed-off-by: Sean Christopherson Message-Id: <20221019165618.927057-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efce5e4e24c30..93c389eaf4711 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6875,12 +6875,11 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); - if (is_tdp_mmu_page(sp)) { + if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - } else { + else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->nx_huge_page_disallowed); - } + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); -- GitLab From f1c5651fda43e0c62a32456cdc6f254f40457409 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Sep 2022 00:13:52 +0000 Subject: [PATCH 0453/1423] KVM: x86/pmu: Force reprogramming of all counters on PMU filter change Force vCPUs to reprogram all counters on a PMU filter change to provide a sane ABI for userspace. Use the existing KVM_REQ_PMU to do the programming, and take advantage of the fact that the reprogram_pmi bitmap fits in a u64 to set all bits in a single atomic update. Note, setting the bitmap and making the request needs to be done _after_ the SRCU synchronization to ensure that vCPUs will reprogram using the new filter. KVM's current "lazy" approach is confusing and non-deterministic. It's confusing because, from a developer perspective, the code is buggy as it makes zero sense to let userspace modify the filter but then not actually enforce the new filter. The lazy approach is non-deterministic because KVM enforces the filter whenever a counter is reprogrammed, not just on guest WRMSRs, i.e. a guest might gain/lose access to an event at random times depending on what is going on in the host. Note, the resulting behavior is still non-determinstic while the filter is in flux. If userspace wants to guarantee deterministic behavior, all vCPUs should be paused during the filter update. Jim Mattson Fixes: 66bb8a065f5a ("KVM: x86: PMU Event Filter") Cc: Aaron Lewis Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 ++++++++++- arch/x86/kvm/pmu.c | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9362b9736d877..58a562bb197c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -527,7 +527,16 @@ struct kvm_pmu { struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; - DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + + /* + * Overlay the bitmap with a 64-bit atomic so that all bits can be + * set in a single access, e.g. to reprogram all counters when the PMU + * filter changes. + */ + union { + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + atomic64_t __reprogram_pmi; + }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index de1fd73697365..bf2c32ea3255e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -577,6 +577,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; + struct kvm_vcpu *vcpu; + unsigned long i; size_t size; int r; @@ -613,9 +615,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + synchronize_srcu_expedited(&kvm->srcu); + + BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > + sizeof(((struct kvm_pmu *)0)->__reprogram_pmi)); + + kvm_for_each_vcpu(i, vcpu, kvm) + atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull); + + kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); + mutex_unlock(&kvm->lock); - synchronize_srcu_expedited(&kvm->srcu); r = 0; cleanup: kfree(filter); -- GitLab From dcbb816a2842e41d3ec22605c6760837d800b20a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Sep 2022 00:13:53 +0000 Subject: [PATCH 0454/1423] KVM: x86/pmu: Clear "reprogram" bit if counter is disabled or disallowed When reprogramming a counter, clear the counter's "reprogram pending" bit if the counter is disabled (by the guest) or is disallowed (by the userspace filter). In both cases, there's no need to re-attempt programming on the next coincident KVM_REQ_PMU as enabling the counter by either method will trigger reprogramming. Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bf2c32ea3255e..be8ce5aeb454f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -150,9 +150,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, __kvm_perf_overflow(pmc, true); } -static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - u64 config, bool exclude_user, - bool exclude_kernel, bool intr) +static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, + bool exclude_user, bool exclude_kernel, + bool intr) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; @@ -204,14 +204,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", PTR_ERR(event), pmc->idx); - return; + return PTR_ERR(event); } pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; - clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; pmc->intr = intr || pebs; + return 0; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -245,7 +245,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) perf_event_enable(pmc->perf_event); pmc->is_paused = false; - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; } @@ -303,10 +302,10 @@ void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - return; + goto reprogram_complete; if (!check_pmu_event_filter(pmc)) - return; + goto reprogram_complete; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -324,16 +323,27 @@ void reprogram_counter(struct kvm_pmc *pmc) } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - return; + goto reprogram_complete; pmc_release_perf_event(pmc); pmc->current_config = new_config; - pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + + /* + * If reprogramming fails, e.g. due to contention, leave the counter's + * regprogram bit set, i.e. opportunistically try again on the next PMU + * refresh. Don't make a new request as doing so can stall the guest + * if reprogramming repeatedly fails. + */ + if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT)) + return; + +reprogram_complete: + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); } EXPORT_SYMBOL_GPL(reprogram_counter); -- GitLab From 68fb4757e8678894530ee0b15c29a3567207b970 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 23 Sep 2022 00:13:54 +0000 Subject: [PATCH 0455/1423] KVM: x86/pmu: Defer reprogram_counter() to kvm_pmu_handle_event() Batch reprogramming PMU counters by setting KVM_REQ_PMU and thus deferring reprogramming kvm_pmu_handle_event() to avoid reprogramming a counter multiple times during a single VM-Exit. Deferring programming will also allow KVM to fix a bug where immediately reprogramming a counter can result in sleeping (taking a mutex) while interrupts are disabled in the VM-Exit fastpath. Introduce kvm_pmu_request_counter_reprogam() to make it obvious that KVM is _requesting_ a reprogram and not actually doing the reprogram. Opportunistically refine related comments to avoid misunderstandings. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-5-likexu@tencent.com Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 17 ++++++++++++----- arch/x86/kvm/pmu.h | 6 +++++- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 6 +++--- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 58a562bb197c8..afadbc4b72c45 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -496,6 +496,7 @@ struct kvm_pmc { struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index be8ce5aeb454f..3054b35b41436 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -101,7 +101,11 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* Ignore counters that have been reprogrammed already. */ + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; @@ -292,7 +296,7 @@ out: return allow_event; } -void reprogram_counter(struct kvm_pmc *pmc) +static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; @@ -345,7 +349,6 @@ void reprogram_counter(struct kvm_pmc *pmc) reprogram_complete: clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -355,10 +358,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { + if (unlikely(!pmc)) { clear_bit(bit, pmu->reprogram_pmi); continue; } + reprogram_counter(pmc); } @@ -552,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; - u64 config = pmc->current_config; + u64 config; if (pmc_is_gp(pmc)) { + config = pmc->eventsel; select_os = config & ARCH_PERFMON_EVENTSEL_OS; select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { + config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); select_os = config & 0x1; select_user = config & 0x2; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5cc5721f260bf..85ff3c0588bac 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_counter(struct kvm_pmc *pmc); +static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +{ + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); +} void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9d65cd095691b..c4b322ffdac48 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -159,7 +159,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } return 0; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b7c3a9874a93c..6c80dff37b772 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) -- GitLab From de0f619564f4713bd548b82d535a954ffa1ee7d8 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 23 Sep 2022 00:13:55 +0000 Subject: [PATCH 0456/1423] KVM: x86/pmu: Defer counter emulated overflow via pmc->prev_counter Defer reprogramming counters and handling overflow via KVM_REQ_PMU when incrementing counters. KVM skips emulated WRMSR in the VM-Exit fastpath, the fastpath runs with IRQs disabled, skipping instructions can increment and reprogram counters, reprogramming counters can sleep, and sleeping is disallowed while IRQs are disabled. [*] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580 [*] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2981888, name: CPU 15/KVM [*] preempt_count: 1, expected: 0 [*] RCU nest depth: 0, expected: 0 [*] INFO: lockdep is turned off. [*] irq event stamp: 0 [*] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [*] hardirqs last disabled at (0): [] copy_process+0x146a/0x62d0 [*] softirqs last enabled at (0): [] copy_process+0x14a9/0x62d0 [*] softirqs last disabled at (0): [<0000000000000000>] 0x0 [*] Preemption disabled at: [*] [] vcpu_enter_guest+0x1001/0x3dc0 [kvm] [*] CPU: 17 PID: 2981888 Comm: CPU 15/KVM Kdump: 5.19.0-rc1-g239111db364c-dirty #2 [*] Call Trace: [*] [*] dump_stack_lvl+0x6c/0x9b [*] __might_resched.cold+0x22e/0x297 [*] __mutex_lock+0xc0/0x23b0 [*] perf_event_ctx_lock_nested+0x18f/0x340 [*] perf_event_pause+0x1a/0x110 [*] reprogram_counter+0x2af/0x1490 [kvm] [*] kvm_pmu_trigger_event+0x429/0x950 [kvm] [*] kvm_skip_emulated_instruction+0x48/0x90 [kvm] [*] handle_fastpath_set_msr_irqoff+0x349/0x3b0 [kvm] [*] vmx_vcpu_run+0x268e/0x3b80 [kvm_intel] [*] vcpu_enter_guest+0x1d22/0x3dc0 [kvm] Add a field to kvm_pmc to track the previous counter value in order to defer overflow detection to kvm_pmu_handle_event() (the counter must be paused before handling overflow, and that may increment the counter). Opportunistically shrink sizeof(struct kvm_pmc) a bit. Suggested-by: Wanpeng Li Fixes: 9cd803d496e7 ("KVM: x86: Update vPMCs when retiring instructions") Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20220831085328.45489-6-likexu@tencent.com [sean: avoid re-triggering KVM_REQ_PMU on overflow, tweak changelog] Signed-off-by: Sean Christopherson Message-Id: <20220923001355.3741194-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/pmu.c | 32 ++++++++++++++++---------------- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index afadbc4b72c45..81114a376c4ee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -491,7 +491,10 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; @@ -501,8 +504,6 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; /* More counters may conflict with other existing Architectural MSRs */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3054b35b41436..684393c221052 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -101,14 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* - * Ignore overflow events for counters that are scheduled to be - * reprogrammed, e.g. if a PMI for the previous event races with KVM's - * handling of a related guest WRMSR. - */ - if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) - return; - if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { if (!in_pmi) { /* @@ -126,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); if (!pmc->intr || skip_pmi) return; @@ -151,7 +142,17 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ + if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) + return; + __kvm_perf_overflow(pmc, true); + + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, @@ -311,6 +312,9 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (!check_pmu_event_filter(pmc)) goto reprogram_complete; + if (pmc->counter < pmc->prev_counter) + __kvm_perf_overflow(pmc, false); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -348,6 +352,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) reprogram_complete: clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc->prev_counter = 0; } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) @@ -536,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - u64 prev_count; - - prev_count = pmc->counter; + pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - - reprogram_counter(pmc); - if (pmc->counter < prev_count) - __kvm_perf_overflow(pmc, false); + kvm_pmu_request_counter_reprogam(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index c4b322ffdac48..0e313fbae0556 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -212,7 +212,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 6c80dff37b772..e5cec07ca8d95 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -646,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); - pmc->counter = 0; + pmc->counter = pmc->prev_counter = 0; } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; -- GitLab From d663b8a285986072428a6a145e5994bc275df994 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 3 Nov 2022 10:44:10 -0400 Subject: [PATCH 0457/1423] KVM: replace direct irq.h inclusion virt/kvm/irqchip.c is including "irq.h" from the arch-specific KVM source directory (i.e. not from arch/*/include) for the sole purpose of retrieving irqchip_in_kernel. Making the function inline in a header that is already included, such as asm/kvm_host.h, is not possible because it needs to look at struct kvm which is defined after asm/kvm_host.h is included. So add a kvm_arch_irqchip_in_kernel non-inline function; irqchip_in_kernel() is only performance critical on arm64 and x86, and the non-inline function is enough on all other architectures. irq.h can then be deleted from all architectures except x86. Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 5 +++++ arch/arm64/kvm/irq.h | 16 ---------------- arch/powerpc/kvm/irq.h | 22 ---------------------- arch/powerpc/kvm/powerpc.c | 18 ++++++++++++++++-- arch/s390/kvm/irq.h | 19 ------------------- arch/s390/kvm/kvm-s390.c | 5 +++++ arch/x86/kvm/irq.c | 5 +++++ include/linux/kvm_host.h | 2 ++ virt/kvm/irqchip.c | 3 +-- 9 files changed, 34 insertions(+), 61 deletions(-) delete mode 100644 arch/arm64/kvm/irq.h delete mode 100644 arch/powerpc/kvm/irq.h delete mode 100644 arch/s390/kvm/irq.h diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10c..7b107fa540fac 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2130,6 +2130,11 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) return NULL; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + bool kvm_arch_has_irq_bypass(void) { return true; diff --git a/arch/arm64/kvm/irq.h b/arch/arm64/kvm/irq.h deleted file mode 100644 index 0d257de42c109..0000000000000 --- a/arch/arm64/kvm/irq.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2016 Red Hat, Inc. - * - * This header is included by irqchip.c. However, on ARM, interrupt - * controller declarations are located in include/kvm/arm_vgic.h since - * they are mostly shared between arm and arm64. - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include - -#endif diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h deleted file mode 100644 index e6463f866abc2..0000000000000 --- a/arch/powerpc/kvm/irq.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __IRQ_H -#define __IRQ_H - -#include - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - int ret = 0; - -#ifdef CONFIG_KVM_MPIC - ret = ret || (kvm->arch.mpic != NULL); -#endif -#ifdef CONFIG_KVM_XICS - ret = ret || (kvm->arch.xics != NULL); - ret = ret || (kvm->arch.xive != NULL); -#endif - smp_rmb(); - return ret; -} - -#endif diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b850b0efa201a..04494a4fb37a0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -36,7 +36,6 @@ #include #include "timing.h" -#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS @@ -2165,10 +2164,25 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) return 0; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); + ret = ret || (kvm->arch.xive != NULL); +#endif + smp_rmb(); + return ret; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { - if (!irqchip_in_kernel(kvm)) + if (!kvm_arch_irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd01..0000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bc491a73815c3..5c7532dbc96b1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5567,6 +5567,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return true; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f371f1292ca3e..d8d50558f1657 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -165,3 +165,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm); } + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8fe4665bd020b..e6e66c5e56f24 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -663,6 +663,8 @@ struct kvm_irq_routing_table { */ struct hlist_head map[]; }; + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #endif #ifndef KVM_INTERNAL_MEM_SLOTS diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 58e4f88b2b9fb..1e567d1f6d3df 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -17,7 +17,6 @@ #include #include #include -#include "irq.h" int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) @@ -50,7 +49,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; - if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) + if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; -- GitLab From 837a55847ead27362aac80aa1cf402459a9757f7 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Mon, 7 Nov 2022 14:53:38 +0900 Subject: [PATCH 0458/1423] RDMA/rxe: Implement packet length validation on responder The function check_length() is supposed to check the length of inbound packets on responder, but it actually has been a stub since the driver was born. Let it check the payload length and the DMA length. Signed-off-by: Daisuke Matsuda Link: https://lore.kernel.org/r/20221107055338.357184-1-matsuda-daisuke@fujitsu.com Reviewed-by: Li Zhijian Acked-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 34 ++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c32bc12cc82f7..382d2053db43e 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -393,16 +393,36 @@ static enum resp_states check_resource(struct rxe_qp *qp, static enum resp_states check_length(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { - switch (qp_type(qp)) { - case IB_QPT_RC: - return RESPST_CHK_RKEY; + int mtu = qp->mtu; + u32 payload = payload_size(pkt); + u32 dmalen = reth_len(pkt); - case IB_QPT_UC: - return RESPST_CHK_RKEY; + /* RoCEv2 packets do not have LRH. + * Let's skip checking it. + */ - default: - return RESPST_CHK_RKEY; + if ((pkt->opcode & RXE_START_MASK) && + (pkt->opcode & RXE_END_MASK)) { + /* "only" packets */ + if (payload > mtu) + return RESPST_ERR_LENGTH; + } else if ((pkt->opcode & RXE_START_MASK) || + (pkt->opcode & RXE_MIDDLE_MASK)) { + /* "first" or "middle" packets */ + if (payload != mtu) + return RESPST_ERR_LENGTH; + } else if (pkt->opcode & RXE_END_MASK) { + /* "last" packets */ + if ((payload == 0) || (payload > mtu)) + return RESPST_ERR_LENGTH; + } + + if (pkt->opcode & (RXE_WRITE_MASK | RXE_READ_MASK)) { + if (dmalen > (1 << 31)) + return RESPST_ERR_LENGTH; } + + return RESPST_CHK_RKEY; } static enum resp_states check_rkey(struct rxe_qp *qp, -- GitLab From aa382ffa705bea9931ec92b6f3c70e1fdb372195 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 8 Nov 2022 17:05:59 -0600 Subject: [PATCH 0459/1423] PCI/sysfs: Fix double free in error path When pci_create_attr() fails, pci_remove_resource_files() is called which will iterate over the res_attr[_wc] arrays and frees every non NULL entry. To avoid a double free here set the array entry only after it's clear we successfully initialized it. Fixes: b562ec8f74e4 ("PCI: Don't leak memory if sysfs_create_bin_file() fails") Link: https://lore.kernel.org/r/20221007070735.GX986@pengutronix.de/ Signed-off-by: Sascha Hauer Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- drivers/pci/pci-sysfs.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 0a2eeb82cebde..ba38fc47d35e9 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1175,11 +1175,9 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) sysfs_bin_attr_init(res_attr); if (write_combine) { - pdev->res_attr_wc[num] = res_attr; sprintf(res_attr_name, "resource%d_wc", num); res_attr->mmap = pci_mmap_resource_wc; } else { - pdev->res_attr[num] = res_attr; sprintf(res_attr_name, "resource%d", num); if (pci_resource_flags(pdev, num) & IORESOURCE_IO) { res_attr->read = pci_read_resource_io; @@ -1197,10 +1195,17 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) res_attr->size = pci_resource_len(pdev, num); res_attr->private = (void *)(unsigned long)num; retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); - if (retval) + if (retval) { kfree(res_attr); + return retval; + } + + if (write_combine) + pdev->res_attr_wc[num] = res_attr; + else + pdev->res_attr[num] = res_attr; - return retval; + return 0; } /** -- GitLab From a39a1466dae5e3ed0dd7c03334a60894e4d1f334 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 13 Oct 2022 14:46:36 -0700 Subject: [PATCH 0460/1423] MAINTAINERS: git://github -> https://github.com for awilliam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Reported-by: Conor Dooley Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Signed-off-by: Palmer Dabbelt Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20221013214636.30721-1-palmer@rivosinc.com Signed-off-by: Alex Williamson --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 046ff06ff97fa..daa6a7a755ece 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21539,7 +21539,7 @@ M: Alex Williamson R: Cornelia Huck L: kvm@vger.kernel.org S: Maintained -T: git git://github.com/awilliam/linux-vfio.git +T: git https://github.com/awilliam/linux-vfio.git F: Documentation/ABI/testing/sysfs-devices-vfio-dev F: Documentation/driver-api/vfio.rst F: drivers/vfio/ -- GitLab From cd48ebc5c4f2e94830b238f035ebf04f1c3a4433 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 22 Sep 2022 20:35:07 +0800 Subject: [PATCH 0461/1423] vfio/mlx5: Switch to use module_pci_driver() macro Since pci provides the helper macro module_pci_driver(), we may replace the module_init/exit with it. Signed-off-by: Shang XiaoJing Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220922123507.11222-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a24..457138b92f134 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -676,18 +676,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); -- GitLab From e67e070632a665c932d534b8b800477bb3111449 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Tue, 18 Oct 2022 12:28:25 -0300 Subject: [PATCH 0462/1423] vfio: platform: Do not pass return buffer to ACPI _RST method The ACPI _RST method has no return value, there's no need to pass a return buffer to acpi_evaluate_object(). Fixes: d30daa33ec1d ("vfio: platform: call _RST method when using ACPI") Signed-off-by: Rafael Mendonca Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20221018152825.891032-1-rafaelmendsr@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 55dc4f43c31e3..1a0a238ffa354 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -72,12 +72,11 @@ static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev, const char **extra_dbg) { #ifdef CONFIG_ACPI - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct device *dev = vdev->device; acpi_handle handle = ACPI_HANDLE(dev); acpi_status acpi_ret; - acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer); + acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL); if (ACPI_FAILURE(acpi_ret)) { if (extra_dbg) *extra_dbg = acpi_format_exception(acpi_ret); -- GitLab From ea00d4ededcd8639f9e814513426cfeccdd3aaf0 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:13 +0100 Subject: [PATCH 0463/1423] vfio/iova_bitmap: Explicitly include linux/slab.h kzalloc/kzfree are used so include `slab.h`. While it happens to work without it, due to commit 8b9f3ac5b01d ("fs: introduce alloc_inode_sb() to allocate filesystems specific inode") which indirectly includes via: . ./include/linux/mm.h .. ./include/linux/huge_mm.h ... ./include/linux/fs.h .... ./include/linux/slab.h Make it explicit should any of its indirect dependencies be dropped/changed for entirely different reasons as it was the cause prior to commit above recently (i.e. <= v5.18). Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-2-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 6631e8befe1b2..56816ba1be9b1 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) -- GitLab From f38044e5ef58ad0346fdabd7027ea5c1e1a3b624 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:14 +0100 Subject: [PATCH 0464/1423] vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps iova_bitmap_set() doesn't consider the end of the page boundary when the first bitmap page offset isn't zero, and wrongly changes the consecutive page right after. Consequently this leads to missing dirty pages from reported by the device as seen from the VMM. The current logic iterates over a given number of base pages and clamps it to the remaining indexes to iterate in the last page. Instead of having to consider extra pages to pin (e.g. first and extra pages), just handle the first page as its own range and let the rest of the bitmap be handled as if it was base page aligned. This is done by changing iova_bitmap_mapped_remaining() to return PAGE_SIZE - pgoff (on the first bitmap page), and leads to pgoff being set to 0 on following iterations. Fixes: 58ccf0190d19 ("vfio: Add an IOVA bitmap support") Reported-by: Avihai Horon Tested-by: Avihai Horon Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-3-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 56816ba1be9b1..de6d6ea5c4967 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -296,11 +296,15 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) */ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { - unsigned long remaining; + unsigned long remaining, bytes; + + /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ + bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : + PAGE_SIZE - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + bytes / sizeof(*bitmap->bitmap)); return remaining; } -- GitLab From 5c20311d76cbaeb7ed2ecf9c8b8322f8fc4a7ae3 Mon Sep 17 00:00:00 2001 From: Leonid Ravich Date: Wed, 9 Nov 2022 11:57:17 +0200 Subject: [PATCH 0465/1423] IB/mad: Don't call to function that might sleep while in atomic context Tracepoints are not allowed to sleep, as such the following splat is generated due to call to ib_query_pkey() in atomic context. WARNING: CPU: 0 PID: 1888000 at kernel/trace/ring_buffer.c:2492 rb_commit+0xc1/0x220 CPU: 0 PID: 1888000 Comm: kworker/u9:0 Kdump: loaded Tainted: G OE --------- - - 4.18.0-305.3.1.el8.x86_64 #1 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module_el8.3.0+555+a55c8938 04/01/2014 Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core] RIP: 0010:rb_commit+0xc1/0x220 RSP: 0000:ffffa8ac80f9bca0 EFLAGS: 00010202 RAX: ffff8951c7c01300 RBX: ffff8951c7c14a00 RCX: 0000000000000246 RDX: ffff8951c707c000 RSI: ffff8951c707c57c RDI: ffff8951c7c14a00 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8951c7c01300 R11: 0000000000000001 R12: 0000000000000246 R13: 0000000000000000 R14: ffffffff964c70c0 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8951fbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f20e8f39010 CR3: 000000002ca10005 CR4: 0000000000170ef0 Call Trace: ring_buffer_unlock_commit+0x1d/0xa0 trace_buffer_unlock_commit_regs+0x3b/0x1b0 trace_event_buffer_commit+0x67/0x1d0 trace_event_raw_event_ib_mad_recv_done_handler+0x11c/0x160 [ib_core] ib_mad_recv_done+0x48b/0xc10 [ib_core] ? trace_event_raw_event_cq_poll+0x6f/0xb0 [ib_core] __ib_process_cq+0x91/0x1c0 [ib_core] ib_cq_poll_work+0x26/0x80 [ib_core] process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 ---[ end trace 78ba8509d3830a16 ]--- Fixes: 821bf1de45a1 ("IB/MAD: Add recv path trace point") Signed-off-by: Leonid Ravich Link: https://lore.kernel.org/r/Y2t5feomyznrVj7V@leonid-Inspiron-3421 Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/mad.c | 5 ----- include/trace/events/ib_mad.h | 13 ++++--------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 1893aa613ad73..674344eb8e2f4 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -59,9 +59,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_qp_info *qp_info, struct trace_event_raw_ib_mad_send_template *entry) { - u16 pkey; - struct ib_device *dev = qp_info->port_priv->device; - u32 pnum = qp_info->port_priv->port_num; struct ib_ud_wr *wr = &mad_send_wr->send_wr; struct rdma_ah_attr attr = {}; @@ -69,8 +66,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, /* These are common */ entry->sl = attr.sl; - ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); - entry->pkey = pkey; entry->rqpn = wr->remote_qpn; entry->rqkey = wr->remote_qkey; entry->dlid = rdma_ah_get_dlid(&attr); diff --git a/include/trace/events/ib_mad.h b/include/trace/events/ib_mad.h index 59363a083ecb9..d92691c78cff6 100644 --- a/include/trace/events/ib_mad.h +++ b/include/trace/events/ib_mad.h @@ -49,7 +49,6 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, __field(int, retries_left) __field(int, max_retries) __field(int, retry) - __field(u16, pkey) ), TP_fast_assign( @@ -89,7 +88,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, "hdr : base_ver 0x%x class 0x%x class_ver 0x%x " \ "method 0x%x status 0x%x class_specific 0x%x tid 0x%llx " \ "attr_id 0x%x attr_mod 0x%x => dlid 0x%08x sl %d "\ - "pkey 0x%x rpqn 0x%x rqpkey 0x%x", + "rpqn 0x%x rqpkey 0x%x", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->agent_priv, be64_to_cpu(__entry->wrtid), __entry->retries_left, __entry->max_retries, @@ -100,7 +99,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - be32_to_cpu(__entry->dlid), __entry->sl, __entry->pkey, + be32_to_cpu(__entry->dlid), __entry->sl, __entry->rqpn, __entry->rqkey ) ); @@ -204,7 +203,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __field(u16, wc_status) __field(u32, slid) __field(u32, dev_index) - __field(u16, pkey) ), TP_fast_assign( @@ -224,9 +222,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __entry->slid = wc->slid; __entry->src_qp = wc->src_qp; __entry->sl = wc->sl; - ib_query_pkey(qp_info->port_priv->device, - qp_info->port_priv->port_num, - wc->pkey_index, &__entry->pkey); __entry->wc_status = wc->status; ), @@ -234,7 +229,7 @@ TRACE_EVENT(ib_mad_recv_done_handler, "base_ver 0x%02x class 0x%02x class_ver 0x%02x " \ "method 0x%02x status 0x%04x class_specific 0x%04x " \ "tid 0x%016llx attr_id 0x%04x attr_mod 0x%08x " \ - "slid 0x%08x src QP%d, sl %d pkey 0x%04x", + "slid 0x%08x src QP%d, sl %d", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->wc_status, __entry->length, @@ -244,7 +239,7 @@ TRACE_EVENT(ib_mad_recv_done_handler, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - __entry->slid, __entry->src_qp, __entry->sl, __entry->pkey + __entry->slid, __entry->src_qp, __entry->sl ) ); -- GitLab From cf87ac739e488055a6046a410caa8f4da108948f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:08 +0800 Subject: [PATCH 0466/1423] KVM: x86: Introduce KVM_REQ_DIRTY_RING_SOFT_FULL The VCPU isn't expected to be runnable when the dirty ring becomes soft full, until the dirty pages are harvested and the dirty ring is reset from userspace. So there is a check in each guest's entrace to see if the dirty ring is soft full or not. The VCPU is stopped from running if its dirty ring has been soft full. The similar check will be needed when the feature is going to be supported on ARM64. As Marc Zyngier suggested, a new event will avoid pointless overhead to check the size of the dirty ring ('vcpu->kvm->dirty_ring_size') in each guest's entrance. Add KVM_REQ_DIRTY_RING_SOFT_FULL. The event is raised when the dirty ring becomes soft full in kvm_dirty_ring_push(). The event is only cleared in the check, done in the newly added helper kvm_dirty_ring_check_request(). Since the VCPU is not runnable when the dirty ring becomes soft full, the KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent the VCPU from running until the dirty pages are harvested and the dirty ring is reset by userspace. kvm_dirty_ring_soft_full() becomes a private function with the newly added helper kvm_dirty_ring_check_request(). The alignment for the various event definitions in kvm_host.h is changed to tab character by the way. In order to avoid using 'container_of()', the argument @ring is replaced by @vcpu in kvm_dirty_ring_push(). Link: https://lore.kernel.org/kvmarm/87lerkwtm5.wl-maz@kernel.org Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Peter Xu Reviewed-by: Sean Christopherson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-2-gshan@redhat.com --- arch/x86/kvm/x86.c | 15 ++++++--------- include/linux/kvm_dirty_ring.h | 12 ++++-------- include/linux/kvm_host.h | 9 +++++---- virt/kvm/dirty_ring.c | 32 ++++++++++++++++++++++++++++++-- virt/kvm/kvm_main.c | 3 +-- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cf1ba865562e..d0d32e67ebf34 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10499,20 +10499,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - /* Forbid vmenter if vcpu dirty ring is soft-full */ - if (unlikely(vcpu->kvm->dirty_ring_size && - kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { - vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; - trace_kvm_dirty_ring_exit(vcpu); - r = 0; - goto out; - } - if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } + + if (kvm_dirty_ring_check_request(vcpu)) { + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 906f899813dc9..9c13c4c3d30c1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -49,7 +49,7 @@ static inline int kvm_dirty_ring_reset(struct kvm *kvm, return 0; } -static inline void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, +static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) { } @@ -64,11 +64,6 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) { } -static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) -{ - return true; -} - #else /* CONFIG_HAVE_KVM_DIRTY_RING */ u32 kvm_dirty_ring_get_rsvd_entries(void); @@ -84,13 +79,14 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring); * returns =0: successfully pushed * <0: unable to push, need to wait */ -void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset); +void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset); + +bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu); /* for use in vm_operations_struct */ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset); void kvm_dirty_ring_free(struct kvm_dirty_ring *ring); -bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring); #endif /* CONFIG_HAVE_KVM_DIRTY_RING */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 00c3448ba7f8b..648d663f32c46 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -153,10 +153,11 @@ static inline bool is_error_page(struct page *page) * Architecture-independent vcpu->requests bit members * Bits 3-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_UNBLOCK 2 -#define KVM_REQUEST_ARCH_BASE 8 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UNBLOCK 2 +#define KVM_REQ_DIRTY_RING_SOFT_FULL 3 +#define KVM_REQUEST_ARCH_BASE 8 /* * KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index d6fabf238032a..fecbb7d75ad22 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -26,7 +26,7 @@ static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); } -bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) { return kvm_dirty_ring_used(ring) >= ring->soft_limit; } @@ -142,13 +142,19 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + /* + * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared + * by the VCPU thread next time when it enters the guest. + */ + trace_kvm_dirty_ring_reset(ring); return count; } -void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset) +void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) { + struct kvm_dirty_ring *ring = &vcpu->dirty_ring; struct kvm_dirty_gfn *entry; /* It should never get full */ @@ -166,6 +172,28 @@ void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset) kvm_dirty_gfn_set_dirtied(entry); ring->dirty_index++; trace_kvm_dirty_ring_push(ring, slot, offset); + + if (kvm_dirty_ring_soft_full(ring)) + kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); +} + +bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu) +{ + /* + * The VCPU isn't runnable when the dirty ring becomes soft full. + * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent + * the VCPU from running until the dirty pages are harvested and + * the dirty ring is reset by userspace. + */ + if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) { + kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + return true; + } + + return false; } struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 46e8ed1ae6470..04b22d2f99d8c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3314,8 +3314,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, u32 slot = (memslot->as_id << 16) | memslot->id; if (kvm->dirty_ring_size) - kvm_dirty_ring_push(&vcpu->dirty_ring, - slot, rel_gfn); + kvm_dirty_ring_push(vcpu, slot, rel_gfn); else set_bit_le(rel_gfn, memslot->dirty_bitmap); } -- GitLab From e8a18565e59303ac12c626a161d72bd890bd2062 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:09 +0800 Subject: [PATCH 0467/1423] KVM: Move declaration of kvm_cpu_dirty_log_size() to kvm_dirty_ring.h Not all architectures like ARM64 need to override the function. Move its declaration to kvm_dirty_ring.h to avoid the following compiling warning on ARM64 when the feature is enabled. arch/arm64/kvm/../../../virt/kvm/dirty_ring.c:14:12: \ warning: no previous prototype for 'kvm_cpu_dirty_log_size' \ [-Wmissing-prototypes] \ int __weak kvm_cpu_dirty_log_size(void) Reported-by: kernel test robot Signed-off-by: Gavin Shan Reviewed-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-3-gshan@redhat.com --- arch/x86/include/asm/kvm_host.h | 2 -- include/linux/kvm_dirty_ring.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7551b6f9c31c5..b4dbde7d9eb1d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2090,8 +2090,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) -int kvm_cpu_dirty_log_size(void); - int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 9c13c4c3d30c1..199ead37b1049 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -66,6 +66,7 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ +int kvm_cpu_dirty_log_size(void); u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); -- GitLab From 86bdf3ebcfe1ded055282536fecce13001874740 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:10 +0800 Subject: [PATCH 0468/1423] KVM: Support dirty ring in conjunction with bitmap ARM64 needs to dirty memory outside of a VCPU context when VGIC/ITS is enabled. It's conflicting with that ring-based dirty page tracking always requires a running VCPU context. Introduce a new flavor of dirty ring that requires the use of both VCPU dirty rings and a dirty bitmap. The expectation is that for non-VCPU sources of dirty memory (such as the VGIC/ITS on arm64), KVM writes to the dirty bitmap. Userspace should scan the dirty bitmap before migrating the VM to the target. Use an additional capability to advertise this behavior. The newly added capability (KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP) can't be enabled before KVM_CAP_DIRTY_LOG_RING_ACQ_REL on ARM64. In this way, the newly added capability is treated as an extension of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. Suggested-by: Marc Zyngier Suggested-by: Peter Xu Co-developed-by: Oliver Upton Signed-off-by: Oliver Upton Signed-off-by: Gavin Shan Acked-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-4-gshan@redhat.com --- Documentation/virt/kvm/api.rst | 34 ++++++++--- .../virt/kvm/devices/arm-vgic-its.rst | 5 +- include/linux/kvm_dirty_ring.h | 7 +++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 6 ++ virt/kvm/dirty_ring.c | 14 +++++ virt/kvm/kvm_main.c | 61 ++++++++++++++++--- 8 files changed, 112 insertions(+), 17 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eee9f857a986f..1f1b09aa6db47 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8003,13 +8003,6 @@ flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one needs to kick the vcpu out of KVM_RUN using a signal. The resulting vmexit ensures that all dirty GFNs are flushed to the dirty rings. -NOTE: the capability KVM_CAP_DIRTY_LOG_RING and the corresponding -ioctl KVM_RESET_DIRTY_RINGS are mutual exclusive to the existing ioctls -KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling -KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual -machine will switch to ring-buffer dirty page tracking and further -KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail. - NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that should be exposed by weakly ordered architecture, in order to indicate the additional memory ordering requirements imposed on userspace when @@ -8018,6 +8011,33 @@ Architecture with TSO-like ordering (such as x86) are allowed to expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL to userspace. +After enabling the dirty rings, the userspace needs to detect the +capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the +ring structures can be backed by per-slot bitmaps. With this capability +advertised, it means the architecture can dirty guest pages without +vcpu/ring context, so that some of the dirty information will still be +maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP +can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL +hasn't been enabled, or any memslot has been existing. + +Note that the bitmap here is only a backup of the ring structure. The +use of the ring and bitmap combination is only beneficial if there is +only a very small amount of memory that is dirtied out of vcpu/ring +context. Otherwise, the stand-alone per-slot bitmap mechanism needs to +be considered. + +To collect dirty bits in the backup bitmap, userspace can use the same +KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all +the generation of the dirty bits is done in a single pass. Collecting +the dirty bitmap should be the very last thing that the VMM does before +considering the state as complete. VMM needs to ensure that the dirty +state is final and avoid missing dirty pages from another ioctl ordered +after the bitmap collection. + +NOTE: One example of using the backup bitmap is saving arm64 vgic/its +tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on +KVM device "kvm-arm-vgic-its" when dirty ring is enabled. + 8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst index d257eddbae296..e053124f77c48 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-its.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst @@ -52,7 +52,10 @@ KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEV_ARM_ITS_SAVE_TABLES save the ITS table data into guest RAM, at the location provisioned - by the guest in corresponding registers/table entries. + by the guest in corresponding registers/table entries. Should userspace + require a form of dirty tracking to identify which pages are modified + by the saving process, it should use a bitmap even if using another + mechanism to track the memory dirtied by the vCPUs. The layout of the tables in guest memory defines an ABI. The entries are laid out in little endian format as described in the last paragraph. diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 199ead37b1049..4862c98d80d36 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -37,6 +37,11 @@ static inline u32 kvm_dirty_ring_get_rsvd_entries(void) return 0; } +static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + return true; +} + static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) { @@ -67,6 +72,8 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ int kvm_cpu_dirty_log_size(void); +bool kvm_use_dirty_bitmap(struct kvm *kvm); +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 648d663f32c46..db83f63f4e61e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -779,6 +779,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139ae..c87b5882d7aef 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 800f9470e36b1..9fb1ff6f19e5f 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -33,6 +33,12 @@ config HAVE_KVM_DIRTY_RING_ACQ_REL bool select HAVE_KVM_DIRTY_RING +# Allow enabling both the dirty bitmap and dirty ring. Only architectures +# that need to dirty memory outside of a vCPU context should select this. +config NEED_KVM_DIRTY_RING_WITH_BITMAP + bool + depends on HAVE_KVM_DIRTY_RING + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index fecbb7d75ad22..c1cd7dfe4a908 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -21,6 +21,20 @@ u32 kvm_dirty_ring_get_rsvd_entries(void) return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); } +bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->slots_lock); + + return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap; +} + +#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + return false; +} +#endif + static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) { return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04b22d2f99d8c..be40d1ce6e913 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1617,7 +1617,7 @@ static int kvm_prepare_memory_region(struct kvm *kvm, new->dirty_bitmap = NULL; else if (old && old->dirty_bitmap) new->dirty_bitmap = old->dirty_bitmap; - else if (!kvm->dirty_ring_size) { + else if (kvm_use_dirty_bitmap(kvm)) { r = kvm_alloc_dirty_bitmap(new); if (r) return r; @@ -2060,8 +2060,8 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, unsigned long n; unsigned long any = 0; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; *memslot = NULL; @@ -2125,8 +2125,8 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) unsigned long *dirty_bitmap_buffer; bool flush; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; @@ -2237,8 +2237,8 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, unsigned long *dirty_bitmap_buffer; bool flush; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; @@ -3305,7 +3305,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); #ifdef CONFIG_HAVE_KVM_DIRTY_RING - if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) + if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) + return; + + if (WARN_ON_ONCE(!kvm_arch_allow_write_without_running_vcpu(kvm) && !vcpu)) return; #endif @@ -3313,7 +3316,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; - if (kvm->dirty_ring_size) + if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); else set_bit_le(rel_gfn, memslot->dirty_bitmap); @@ -4482,6 +4485,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; +#endif +#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: #endif case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: @@ -4558,6 +4564,20 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; } +static bool kvm_are_all_memslots_empty(struct kvm *kvm) +{ + int i; + + lockdep_assert_held(&kvm->slots_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) + return false; + } + + return true; +} + static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -4588,6 +4608,29 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return -EINVAL; return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: { + int r = -EINVAL; + + if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) || + !kvm->dirty_ring_size || cap->flags) + return r; + + mutex_lock(&kvm->slots_lock); + + /* + * For simplicity, allow enabling ring+bitmap if and only if + * there are no memslots, e.g. to ensure all memslots allocate + * a bitmap after the capability is enabled. + */ + if (kvm_are_all_memslots_empty(kvm)) { + kvm->dirty_ring_with_bitmap = true; + r = 0; + } + + mutex_unlock(&kvm->slots_lock); + + return r; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } -- GitLab From 9cb1096f8590bc590326087bea65db932b53c3b5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:11 +0800 Subject: [PATCH 0469/1423] KVM: arm64: Enable ring-based dirty memory tracking Enable ring-based dirty memory tracking on ARM64: - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL. - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP. - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page offset. - Add ARM64 specific kvm_arch_allow_write_without_running_vcpu() to keep the site of saving vgic/its tables out of the no-running-vcpu radar. Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-5-gshan@redhat.com --- Documentation/virt/kvm/api.rst | 2 +- arch/arm64/include/uapi/asm/kvm.h | 1 + arch/arm64/kvm/Kconfig | 2 ++ arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/vgic/vgic-its.c | 20 ++++++++++++++++++++ include/kvm/arm_vgic.h | 1 + 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1f1b09aa6db47..773e4b202f479 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7921,7 +7921,7 @@ regardless of what has actually been exposed through the CPUID leaf. 8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL ---------------------------------------------------------- -:Architectures: x86 +:Architectures: x86, arm64 :Parameters: args[0] - size of the dirty log ring KVM is capable of tracking dirty memory using ring buffers that are diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 316917b987070..a7a857f1784d8 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -43,6 +43,7 @@ #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 815cc118c675f..05da3c8f7e88f 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -32,6 +32,8 @@ menuconfig KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD + select HAVE_KVM_DIRTY_RING_ACQ_REL + select NEED_KVM_DIRTY_RING_WITH_BITMAP select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10c..6b097605e38ce 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -746,6 +746,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) return kvm_vcpu_suspend(vcpu); + + if (kvm_dirty_ring_check_request(vcpu)) + return 0; } return 1; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 733b53055f976..94a666dd1443d 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2743,6 +2743,7 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); + struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2762,7 +2763,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: + dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); + dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2775,6 +2778,23 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) return ret; } +/* + * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory + * without the running VCPU when dirty ring is enabled. + * + * The running VCPU is required to track dirty guest pages when dirty ring + * is enabled. Otherwise, the backup bitmap should be used to track the + * dirty guest pages. When vgic/its tables are being saved, the backup + * bitmap is used to track the dirty guest pages due to the missed running + * VCPU in the period. + */ +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + return dist->save_its_tables_in_progress; +} + static int vgic_its_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4df9e73a8bb5f..9270cd87da3fc 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,6 +263,7 @@ struct vgic_dist { struct vgic_io_device dist_iodev; bool has_its; + bool save_its_tables_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. -- GitLab From a737f5ffb1e883e580730122be11c9eb832a7749 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:12 +0800 Subject: [PATCH 0470/1423] KVM: selftests: Use host page size to map ring buffer in dirty_log_test In vcpu_map_dirty_ring(), the guest's page size is used to figure out the offset in the virtual area. It works fine when we have same page sizes on host and guest. However, it fails when the page sizes on host and guest are different on arm64, like below error messages indicates. # ./dirty_log_test -M dirty-ring -m 7 Setting log mode to: 'dirty-ring' Test iterations: 32, interval: 10 (ms) Testing guest mode: PA-bits:40, VA-bits:48, 64K pages guest physical test memory offset: 0xffbffc0000 vcpu stops because vcpu is kicked out... Notifying vcpu to continue vcpu continues now. ==== Test Assertion Failure ==== lib/kvm_util.c:1477: addr == MAP_FAILED pid=9000 tid=9000 errno=0 - Success 1 0x0000000000405f5b: vcpu_map_dirty_ring at kvm_util.c:1477 2 0x0000000000402ebb: dirty_ring_collect_dirty_pages at dirty_log_test.c:349 3 0x00000000004029b3: log_mode_collect_dirty_pages at dirty_log_test.c:478 4 (inlined by) run_test at dirty_log_test.c:778 5 (inlined by) run_test at dirty_log_test.c:691 6 0x0000000000403a57: for_each_guest_mode at guest_modes.c:105 7 0x0000000000401ccf: main at dirty_log_test.c:921 8 0x0000ffffb06ec79b: ?? ??:0 9 0x0000ffffb06ec86b: ?? ??:0 10 0x0000000000401def: _start at ??:? Dirty ring mapped private Fix the issue by using host's page size to map the ring buffer. Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-6-gshan@redhat.com --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f1cb1627161fd..89a1a420ebd5e 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1506,7 +1506,7 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) { - uint32_t page_size = vcpu->vm->page_size; + uint32_t page_size = getpagesize(); uint32_t size = vcpu->vm->dirty_ring_size; TEST_ASSERT(size > 0, "Should enable dirty ring first"); -- GitLab From 7167190ddb863bd061c0c6b61f4cec94184b40da Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:13 +0800 Subject: [PATCH 0471/1423] KVM: selftests: Clear dirty ring states between two modes in dirty_log_test There are two states, which need to be cleared before next mode is executed. Otherwise, we will hit failure as the following messages indicate. - The variable 'dirty_ring_vcpu_ring_full' shared by main and vcpu thread. It's indicating if the vcpu exit due to full ring buffer. The value can be carried from previous mode (VM_MODE_P40V48_4K) to current one (VM_MODE_P40V48_64K) when VM_MODE_P40V48_16K isn't supported. - The current ring buffer index needs to be reset before next mode (VM_MODE_P40V48_64K) is executed. Otherwise, the stale value is carried from previous mode (VM_MODE_P40V48_4K). # ./dirty_log_test -M dirty-ring Setting log mode to: 'dirty-ring' Test iterations: 32, interval: 10 (ms) Testing guest mode: PA-bits:40, VA-bits:48, 4K pages guest physical test memory offset: 0xffbfffc000 : Dirtied 995328 pages Total bits checked: dirty (1012434), clear (7114123), track_next (966700) Testing guest mode: PA-bits:40, VA-bits:48, 64K pages guest physical test memory offset: 0xffbffc0000 vcpu stops because vcpu is kicked out... vcpu continues now. Notifying vcpu to continue Iteration 1 collected 0 pages vcpu stops because dirty ring is full... vcpu continues now. vcpu stops because dirty ring is full... vcpu continues now. vcpu stops because dirty ring is full... ==== Test Assertion Failure ==== dirty_log_test.c:369: cleared == count pid=10541 tid=10541 errno=22 - Invalid argument 1 0x0000000000403087: dirty_ring_collect_dirty_pages at dirty_log_test.c:369 2 0x0000000000402a0b: log_mode_collect_dirty_pages at dirty_log_test.c:492 3 (inlined by) run_test at dirty_log_test.c:795 4 (inlined by) run_test at dirty_log_test.c:705 5 0x0000000000403a37: for_each_guest_mode at guest_modes.c:100 6 0x0000000000401ccf: main at dirty_log_test.c:938 7 0x0000ffff9ecd279b: ?? ??:0 8 0x0000ffff9ecd286b: ?? ??:0 9 0x0000000000401def: _start at ??:? Reset dirty pages (0) mismatch with collected (35566) Fix the issues by clearing 'dirty_ring_vcpu_ring_full' and the ring buffer index before next new mode is to be executed. Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-7-gshan@redhat.com --- tools/testing/selftests/kvm/dirty_log_test.c | 27 ++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index b5234d6efbe15..8758c10ec8503 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -226,13 +226,15 @@ static void clear_log_create_vm_done(struct kvm_vm *vm) } static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, - void *bitmap, uint32_t num_pages) + void *bitmap, uint32_t num_pages, + uint32_t *unused) { kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); } static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, - void *bitmap, uint32_t num_pages) + void *bitmap, uint32_t num_pages, + uint32_t *unused) { kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages); @@ -329,10 +331,9 @@ static void dirty_ring_continue_vcpu(void) } static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, - void *bitmap, uint32_t num_pages) + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx) { - /* We only have one vcpu */ - static uint32_t fetch_index = 0; uint32_t count = 0, cleared; bool continued_vcpu = false; @@ -349,7 +350,8 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, /* Only have one vcpu */ count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu), - slot, bitmap, num_pages, &fetch_index); + slot, bitmap, num_pages, + ring_buf_idx); cleared = kvm_vm_reset_dirty_ring(vcpu->vm); @@ -406,7 +408,8 @@ struct log_mode { void (*create_vm_done)(struct kvm_vm *vm); /* Hook to collect the dirty pages into the bitmap provided */ void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot, - void *bitmap, uint32_t num_pages); + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx); /* Hook to call when after each vcpu run */ void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err); void (*before_vcpu_join) (void); @@ -471,13 +474,14 @@ static void log_mode_create_vm_done(struct kvm_vm *vm) } static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, - void *bitmap, uint32_t num_pages) + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx) { struct log_mode *mode = &log_modes[host_log_mode]; TEST_ASSERT(mode->collect_dirty_pages != NULL, "collect_dirty_pages() is required for any log mode!"); - mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages); + mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx); } static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) @@ -696,6 +700,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vcpu *vcpu; struct kvm_vm *vm; unsigned long *bmap; + uint32_t ring_buf_idx = 0; if (!log_mode_supported()) { print_skip("Log mode '%s' not supported", @@ -771,6 +776,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) host_dirty_count = 0; host_clear_count = 0; host_track_next_count = 0; + WRITE_ONCE(dirty_ring_vcpu_ring_full, false); pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); @@ -778,7 +784,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Give the vcpu thread some time to dirty some pages */ usleep(p->interval * 1000); log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, - bmap, host_num_pages); + bmap, host_num_pages, + &ring_buf_idx); /* * See vcpu_sync_stop_requested definition for details on why -- GitLab From dc6df7d4d0633e65850d5372ae9f1234bcc6e26e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:14 +0800 Subject: [PATCH 0472/1423] KVM: selftests: Automate choosing dirty ring size in dirty_log_test In the dirty ring case, we rely on vcpu exit due to full dirty ring state. On ARM64 system, there are 4096 host pages when the host page size is 64KB. In this case, the vcpu never exits due to the full dirty ring state. The similar case is 4KB page size on host and 64KB page size on guest. The vcpu corrupts same set of host pages, but the dirty page information isn't collected in the main thread. This leads to infinite loop as the following log shows. # ./dirty_log_test -M dirty-ring -c 65536 -m 5 Setting log mode to: 'dirty-ring' Test iterations: 32, interval: 10 (ms) Testing guest mode: PA-bits:40, VA-bits:48, 4K pages guest physical test memory offset: 0xffbffe0000 vcpu stops because vcpu is kicked out... Notifying vcpu to continue vcpu continues now. Iteration 1 collected 576 pages Fix the issue by automatically choosing the best dirty ring size, to ensure vcpu exit due to full dirty ring state. The option '-c' becomes a hint to the dirty ring count, instead of the value of it. Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-8-gshan@redhat.com --- tools/testing/selftests/kvm/dirty_log_test.c | 26 +++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 8758c10ec8503..a87e5f78ebf10 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -24,6 +24,9 @@ #include "guest_modes.h" #include "processor.h" +#define DIRTY_MEM_BITS 30 /* 1G */ +#define PAGE_SHIFT_4K 12 + /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 @@ -273,6 +276,24 @@ static bool dirty_ring_supported(void) static void dirty_ring_create_vm_done(struct kvm_vm *vm) { + uint64_t pages; + uint32_t limit; + + /* + * We rely on vcpu exit due to full dirty ring state. Adjust + * the ring buffer size to ensure we're able to reach the + * full dirty ring state. + */ + pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3; + pages = vm_adjust_num_guest_pages(vm->mode, pages); + if (vm->page_size < getpagesize()) + pages = vm_num_host_pages(vm->mode, pages); + + limit = 1 << (31 - __builtin_clz(pages)); + test_dirty_ring_count = 1 << (31 - __builtin_clz(test_dirty_ring_count)); + test_dirty_ring_count = min(limit, test_dirty_ring_count); + pr_info("dirty ring count: 0x%x\n", test_dirty_ring_count); + /* * Switch to dirty ring mode after VM creation but before any * of the vcpu creation. @@ -685,9 +706,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, return vm; } -#define DIRTY_MEM_BITS 30 /* 1G */ -#define PAGE_SHIFT_4K 12 - struct test_params { unsigned long iterations; unsigned long interval; @@ -830,7 +848,7 @@ static void help(char *name) printf("usage: %s [-h] [-i iterations] [-I interval] " "[-p offset] [-m mode]\n", name); puts(""); - printf(" -c: specify dirty ring size, in number of entries\n"); + printf(" -c: hint to dirty ring size, in number of entries\n"); printf(" (only useful for dirty-ring test; default: %"PRIu32")\n", TEST_DIRTY_RING_COUNT); printf(" -i: specify iteration counts (default: %"PRIu64")\n", -- GitLab From 8dab99c9eab3162bfb4326c35579a3388dbf68f2 Mon Sep 17 00:00:00 2001 From: Guillaume La Roque Date: Mon, 7 Nov 2022 18:29:21 +0100 Subject: [PATCH 0473/1423] gpio: davinci: add support of module build Added module build support for the davinci gpio driver Signed-off-by: Guillaume La Roque Signed-off-by: Nicolas Frayer Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-davinci.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 4bfedb0109a76..ec7cfd4f52b1e 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -228,7 +228,7 @@ config GPIO_CLPS711X Say yes here to support GPIO on CLPS711X SoCs. config GPIO_DAVINCI - bool "TI Davinci/Keystone GPIO support" + tristate "TI Davinci/Keystone GPIO support" default y if ARCH_DAVINCI depends on (ARM || ARM64) && (ARCH_DAVINCI || ARCH_KEYSTONE || ARCH_K3) help diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c index 1018860c83c27..fa51a91afa54f 100644 --- a/drivers/gpio/gpio-davinci.c +++ b/drivers/gpio/gpio-davinci.c @@ -727,3 +727,14 @@ static int __init davinci_gpio_drv_reg(void) return platform_driver_register(&davinci_gpio_driver); } postcore_initcall(davinci_gpio_drv_reg); + +static void __exit davinci_gpio_exit(void) +{ + platform_driver_unregister(&davinci_gpio_driver); +} +module_exit(davinci_gpio_exit); + +MODULE_AUTHOR("Jan Kotas "); +MODULE_DESCRIPTION("DAVINCI GPIO driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:gpio-davinci"); -- GitLab From dfc7a7769ab7f2a2f629c673717ef1fa7b63aa42 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:31 +0000 Subject: [PATCH 0474/1423] KVM: arm64: Combine visitor arguments into a context structure Passing new arguments by value to the visitor callbacks is extremely inflexible for stuffing new parameters used by only some of the visitors. Use a context structure instead and pass the pointer through to the visitor callback. While at it, redefine the 'flags' parameter to the visitor to contain the bit indicating the phase of the walk. Pass the entire set of flags through the context structure such that the walker can communicate additional state to the visitor callback. No functional change intended. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Reviewed-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-2-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 15 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 +- arch/arm64/kvm/hyp/nvhe/setup.c | 16 +- arch/arm64/kvm/hyp/pgtable.c | 269 +++++++++++++------------- 4 files changed, 154 insertions(+), 156 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3252eb50ecfe5..607f9bb8aab4e 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -199,10 +199,17 @@ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_TABLE_POST = BIT(2), }; -typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg); +struct kvm_pgtable_visit_ctx { + kvm_pte_t *ptep; + void *arg; + u64 addr; + u64 end; + u32 level; + enum kvm_pgtable_walk_flags flags; +}; + +typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit); /** * struct kvm_pgtable_walker - Hook into a page-table walk. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 1e78acf9662eb..8f5b6a36a039e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -417,13 +417,11 @@ struct check_walk_data { enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); }; -static int __check_page_state_visitor(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct check_walk_data *d = arg; - kvm_pte_t pte = *ptep; + struct check_walk_data *d = ctx->arg; + kvm_pte_t pte = *ctx->ptep; if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) return -EINVAL; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index e8d4ea2fcfa09..a293cf5eba1b1 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -186,15 +186,13 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } -static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; - kvm_pte_t pte = *ptep; + kvm_pte_t pte = *ctx->ptep; phys_addr_t phys; if (!kvm_pte_valid(pte)) @@ -205,11 +203,11 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, * was unable to access the hyp_vmemmap and so the buddy allocator has * initialised the refcount to '1'. */ - mm_ops->get_page(ptep); - if (flag != KVM_PGTABLE_WALK_LEAF) + mm_ops->get_page(ctx->ptep); + if (visit != KVM_PGTABLE_WALK_LEAF) return 0; - if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; phys = kvm_pte_to_phys(pte); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cdf8e76b0be14..900c8b9c0cfcd 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -64,20 +64,20 @@ static bool kvm_phys_is_valid(u64 phys) return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); } -static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { - u64 granule = kvm_granule_size(level); + u64 granule = kvm_granule_size(ctx->level); - if (!kvm_level_supports_block_mapping(level)) + if (!kvm_level_supports_block_mapping(ctx->level)) return false; - if (granule > (end - addr)) + if (granule > (ctx->end - ctx->addr)) return false; if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) return false; - return IS_ALIGNED(addr, granule); + return IS_ALIGNED(ctx->addr, granule); } static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) @@ -172,12 +172,12 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } -static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, - u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag) +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, + const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_walker *walker = data->walker; - return walker->cb(addr, data->end, level, ptep, flag, walker->arg); + return walker->cb(ctx, visit); } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, @@ -186,20 +186,24 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, kvm_pte_t *ptep, u32 level) { + enum kvm_pgtable_walk_flags flags = data->walker->flags; + struct kvm_pgtable_visit_ctx ctx = { + .ptep = ptep, + .arg = data->walker->arg, + .addr = data->addr, + .end = data->end, + .level = level, + .flags = flags, + }; int ret = 0; - u64 addr = data->addr; kvm_pte_t *childp, pte = *ptep; bool table = kvm_pte_table(pte, level); - enum kvm_pgtable_walk_flags flags = data->walker->flags; - if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_TABLE_PRE); - } + if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); - if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_LEAF); + if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); pte = *ptep; table = kvm_pte_table(pte, level); } @@ -218,10 +222,8 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, if (ret) goto out; - if (flags & KVM_PGTABLE_WALK_TABLE_POST) { - ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, - KVM_PGTABLE_WALK_TABLE_POST); - } + if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) + ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: return ret; @@ -292,13 +294,13 @@ struct leaf_walk_data { u32 level; }; -static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct leaf_walk_data *data = arg; + struct leaf_walk_data *data = ctx->arg; - data->pte = *ptep; - data->level = level; + data->pte = *ctx->ptep; + data->level = ctx->level; return 0; } @@ -383,47 +385,47 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) return prot; } -static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, struct hyp_map_data *data) +static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, + struct hyp_map_data *data) { - kvm_pte_t new, old = *ptep; - u64 granule = kvm_granule_size(level), phys = data->phys; + kvm_pte_t new, old = *ctx->ptep; + u64 granule = kvm_granule_size(ctx->level), phys = data->phys; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!kvm_block_mapping_supported(ctx, phys)) return false; data->phys += granule; - new = kvm_init_valid_leaf_pte(phys, data->attr, level); + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); if (old == new) return true; if (!kvm_pte_valid(old)) - data->mm_ops->get_page(ptep); + data->mm_ops->get_page(ctx->ptep); else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; - smp_store_release(ptep, new); + smp_store_release(ctx->ptep, new); return true; } -static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp; - struct hyp_map_data *data = arg; + struct hyp_map_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) + if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!childp) return -ENOMEM; - kvm_set_table_pte(ptep, childp, mm_ops); - mm_ops->get_page(ptep); + kvm_set_table_pte(ctx->ptep, childp, mm_ops); + mm_ops->get_page(ctx->ptep); return 0; } @@ -456,39 +458,39 @@ struct hyp_unmap_data { struct kvm_pgtable_mm_ops *mm_ops; }; -static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ptep, *childp = NULL; - u64 granule = kvm_granule_size(level); - struct hyp_unmap_data *data = arg; + kvm_pte_t pte = *ctx->ptep, *childp = NULL; + u64 granule = kvm_granule_size(ctx->level); + struct hyp_unmap_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (!kvm_pte_valid(pte)) return -EINVAL; - if (kvm_pte_table(pte, level)) { + if (kvm_pte_table(pte, ctx->level)) { childp = kvm_pte_follow(pte, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; - kvm_clear_pte(ptep); + kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); } else { - if (end - addr < granule) + if (ctx->end - ctx->addr < granule) return -EINVAL; - kvm_clear_pte(ptep); + kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); data->unmapped += granule; } dsb(ish); isb(); - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); if (childp) mm_ops->put_page(childp); @@ -532,18 +534,18 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, return 0; } -static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; - kvm_pte_t pte = *ptep; + struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; + kvm_pte_t pte = *ctx->ptep; if (!kvm_pte_valid(pte)) return 0; - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, level)) + if (kvm_pte_table(pte, ctx->level)) mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); return 0; @@ -682,19 +684,19 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) return !!pte; } -static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, - u32 level, struct kvm_pgtable_mm_ops *mm_ops) +static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) { /* * Clear the existing PTE, and perform break-before-make with * TLB maintenance if it was valid. */ - if (kvm_pte_valid(*ptep)) { - kvm_clear_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + if (kvm_pte_valid(*ctx->ptep)) { + kvm_clear_pte(ctx->ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); } static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) @@ -708,29 +710,28 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } -static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, +static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) return false; - return kvm_block_mapping_supported(addr, end, data->phys, level); + return kvm_block_mapping_supported(ctx, data->phys); } -static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, +static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - kvm_pte_t new, old = *ptep; - u64 granule = kvm_granule_size(level), phys = data->phys; + kvm_pte_t new, old = *ctx->ptep; + u64 granule = kvm_granule_size(ctx->level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!stage2_leaf_mapping_allowed(addr, end, level, data)) + if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) - new = kvm_init_valid_leaf_pte(phys, data->attr, level); + new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); @@ -744,7 +745,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!stage2_pte_needs_update(old, new)) return -EAGAIN; - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + stage2_put_pte(ctx, data->mmu, mm_ops); } /* Perform CMOs before installation of the guest stage-2 PTE */ @@ -755,26 +756,25 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - smp_store_release(ptep, new); + smp_store_release(ctx->ptep, new); if (stage2_pte_is_counted(new)) - mm_ops->get_page(ptep); + mm_ops->get_page(ctx->ptep); if (kvm_phys_is_valid(phys)) data->phys += granule; return 0; } -static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, +static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { if (data->anchor) return 0; - if (!stage2_leaf_mapping_allowed(addr, end, level, data)) + if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - data->childp = kvm_pte_follow(*ptep, data->mm_ops); - kvm_clear_pte(ptep); + data->childp = kvm_pte_follow(*ctx->ptep, data->mm_ops); + kvm_clear_pte(ctx->ptep); /* * Invalidate the whole stage-2, as we may have numerous leaf @@ -782,29 +782,29 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, * individually. */ kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu); - data->anchor = ptep; + data->anchor = ctx->ptep; return 0; } -static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, +static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - kvm_pte_t *childp, pte = *ptep; + kvm_pte_t *childp, pte = *ctx->ptep; int ret; if (data->anchor) { if (stage2_pte_is_counted(pte)) - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); return 0; } - ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data); + ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret; - if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; if (!data->memcache) @@ -820,16 +820,15 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * will be mapped lazily. */ if (stage2_pte_is_counted(pte)) - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + stage2_put_pte(ctx, data->mmu, mm_ops); - kvm_set_table_pte(ptep, childp, mm_ops); - mm_ops->get_page(ptep); + kvm_set_table_pte(ctx->ptep, childp, mm_ops); + mm_ops->get_page(ctx->ptep); return 0; } -static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, +static int stage2_map_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; @@ -839,17 +838,17 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, if (!data->anchor) return 0; - if (data->anchor == ptep) { + if (data->anchor == ctx->ptep) { childp = data->childp; data->anchor = NULL; data->childp = NULL; - ret = stage2_map_walk_leaf(addr, end, level, ptep, data); + ret = stage2_map_walk_leaf(ctx, data); } else { - childp = kvm_pte_follow(*ptep, mm_ops); + childp = kvm_pte_follow(*ctx->ptep, mm_ops); } mm_ops->put_page(childp); - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); return ret; } @@ -873,18 +872,18 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, * the page-table, installing the block entry when it revisits the anchor * pointer and clearing the anchor to NULL. */ -static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, void * const arg) +static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct stage2_map_data *data = arg; + struct stage2_map_data *data = ctx->arg; - switch (flag) { + switch (visit) { case KVM_PGTABLE_WALK_TABLE_PRE: - return stage2_map_walk_table_pre(addr, end, level, ptep, data); + return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: - return stage2_map_walk_leaf(addr, end, level, ptep, data); + return stage2_map_walk_leaf(ctx, data); case KVM_PGTABLE_WALK_TABLE_POST: - return stage2_map_walk_table_post(addr, end, level, ptep, data); + return stage2_map_walk_table_post(ctx, data); } return -EINVAL; @@ -949,25 +948,24 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable *pgt = arg; + struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ptep, *childp = NULL; + kvm_pte_t pte = *ctx->ptep, *childp = NULL; bool need_flush = false; if (!kvm_pte_valid(pte)) { if (stage2_pte_is_counted(pte)) { - kvm_clear_pte(ptep); - mm_ops->put_page(ptep); + kvm_clear_pte(ctx->ptep); + mm_ops->put_page(ctx->ptep); } return 0; } - if (kvm_pte_table(pte, level)) { + if (kvm_pte_table(pte, ctx->level)) { childp = kvm_pte_follow(pte, mm_ops); if (mm_ops->page_count(childp) != 1) @@ -981,11 +979,11 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * block entry and rely on the remaining portions being faulted * back lazily. */ - stage2_put_pte(ptep, mmu, addr, level, mm_ops); + stage2_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); + kvm_granule_size(ctx->level)); if (childp) mm_ops->put_page(childp); @@ -1012,18 +1010,17 @@ struct stage2_attr_data { struct kvm_pgtable_mm_ops *mm_ops; }; -static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ptep; - struct stage2_attr_data *data = arg; + kvm_pte_t pte = *ctx->ptep; + struct stage2_attr_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (!kvm_pte_valid(pte)) return 0; - data->level = level; + data->level = ctx->level; data->pte = pte; pte &= ~data->attr_clr; pte |= data->attr_set; @@ -1039,10 +1036,10 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * stage-2 PTE if we are going to add executable permission. */ if (mm_ops->icache_inval_pou && - stage2_pte_executable(pte) && !stage2_pte_executable(*ptep)) + stage2_pte_executable(pte) && !stage2_pte_executable(*ctx->ptep)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); - WRITE_ONCE(*ptep, pte); + kvm_granule_size(ctx->level)); + WRITE_ONCE(*ctx->ptep, pte); } return 0; @@ -1140,20 +1137,19 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, return ret; } -static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable *pgt = arg; + struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ptep; + kvm_pte_t pte = *ctx->ptep; if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) return 0; if (mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), - kvm_granule_size(level)); + kvm_granule_size(ctx->level)); return 0; } @@ -1200,19 +1196,18 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, return 0; } -static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = arg; - kvm_pte_t pte = *ptep; + struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; + kvm_pte_t pte = *ctx->ptep; if (!stage2_pte_is_counted(pte)) return 0; - mm_ops->put_page(ptep); + mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, level)) + if (kvm_pte_table(pte, ctx->level)) mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); return 0; -- GitLab From 83844a2317ecad935f6735abd854e4bf3f757040 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:32 +0000 Subject: [PATCH 0475/1423] KVM: arm64: Stash observed pte value in visitor context Rather than reading the ptep all over the shop, read the ptep once from __kvm_pgtable_visit() and stick it in the visitor context. Reread the ptep after visiting a leaf in case the callback installed a new table underneath. No functional change intended. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Reviewed-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-3-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 5 +- arch/arm64/kvm/hyp/nvhe/setup.c | 7 +-- arch/arm64/kvm/hyp/pgtable.c | 86 +++++++++++++-------------- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 607f9bb8aab4e..14d4b68a1e92c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -201,6 +201,7 @@ enum kvm_pgtable_walk_flags { struct kvm_pgtable_visit_ctx { kvm_pte_t *ptep; + kvm_pte_t old; void *arg; u64 addr; u64 end; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8f5b6a36a039e..d21d1b08a055b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -421,12 +421,11 @@ static int __check_page_state_visitor(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct check_walk_data *d = ctx->arg; - kvm_pte_t pte = *ctx->ptep; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) + if (kvm_pte_valid(ctx->old) && !addr_is_memory(kvm_pte_to_phys(ctx->old))) return -EINVAL; - return d->get_page_state(pte) == d->desired ? 0 : -EPERM; + return d->get_page_state(ctx->old) == d->desired ? 0 : -EPERM; } static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index a293cf5eba1b1..6af443c9d78e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -192,10 +192,9 @@ static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; - kvm_pte_t pte = *ctx->ptep; phys_addr_t phys; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; /* @@ -210,7 +209,7 @@ static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; - phys = kvm_pte_to_phys(pte); + phys = kvm_pte_to_phys(ctx->old); if (!addr_is_memory(phys)) return -EINVAL; @@ -218,7 +217,7 @@ static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx * Adjust the host stage-2 mappings to match the ownership attributes * configured in the hypervisor stage-1. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); switch (state) { case PKVM_PAGE_OWNED: return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 900c8b9c0cfcd..fb3696b3a9976 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -189,6 +189,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, enum kvm_pgtable_walk_flags flags = data->walker->flags; struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, + .old = READ_ONCE(*ptep), .arg = data->walker->arg, .addr = data->addr, .end = data->end, @@ -196,16 +197,16 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, .flags = flags, }; int ret = 0; - kvm_pte_t *childp, pte = *ptep; - bool table = kvm_pte_table(pte, level); + kvm_pte_t *childp; + bool table = kvm_pte_table(ctx.old, level); if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); - pte = *ptep; - table = kvm_pte_table(pte, level); + ctx.old = READ_ONCE(*ptep); + table = kvm_pte_table(ctx.old, level); } if (ret) @@ -217,7 +218,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; } - childp = kvm_pte_follow(pte, data->pgt->mm_ops); + childp = kvm_pte_follow(ctx.old, data->pgt->mm_ops); ret = __kvm_pgtable_walk(data, childp, level + 1); if (ret) goto out; @@ -299,7 +300,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, { struct leaf_walk_data *data = ctx->arg; - data->pte = *ctx->ptep; + data->pte = ctx->old; data->level = ctx->level; return 0; @@ -388,7 +389,7 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct hyp_map_data *data) { - kvm_pte_t new, old = *ctx->ptep; + kvm_pte_t new; u64 granule = kvm_granule_size(ctx->level), phys = data->phys; if (!kvm_block_mapping_supported(ctx, phys)) @@ -396,11 +397,11 @@ static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, data->phys += granule; new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); - if (old == new) + if (ctx->old == new) return true; - if (!kvm_pte_valid(old)) + if (!kvm_pte_valid(ctx->old)) data->mm_ops->get_page(ctx->ptep); - else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; smp_store_release(ctx->ptep, new); @@ -461,16 +462,16 @@ struct hyp_unmap_data { static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ctx->ptep, *childp = NULL; + kvm_pte_t *childp = NULL; u64 granule = kvm_granule_size(ctx->level); struct hyp_unmap_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return -EINVAL; - if (kvm_pte_table(pte, ctx->level)) { - childp = kvm_pte_follow(pte, mm_ops); + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; @@ -538,15 +539,14 @@ static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; - kvm_pte_t pte = *ctx->ptep; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, ctx->level)) - mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); + if (kvm_pte_table(ctx->old, ctx->level)) + mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } @@ -691,7 +691,7 @@ static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s * Clear the existing PTE, and perform break-before-make with * TLB maintenance if it was valid. */ - if (kvm_pte_valid(*ctx->ptep)) { + if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } @@ -722,7 +722,7 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - kvm_pte_t new, old = *ctx->ptep; + kvm_pte_t new; u64 granule = kvm_granule_size(ctx->level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; @@ -735,14 +735,14 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, else new = kvm_init_invalid_leaf_owner(data->owner_id); - if (stage2_pte_is_counted(old)) { + if (stage2_pte_is_counted(ctx->old)) { /* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, * the vCPU will exit one more time from guest if still needed * and then go through the path of relaxing permissions. */ - if (!stage2_pte_needs_update(old, new)) + if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; stage2_put_pte(ctx, data->mmu, mm_ops); @@ -773,7 +773,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - data->childp = kvm_pte_follow(*ctx->ptep, data->mm_ops); + data->childp = kvm_pte_follow(ctx->old, data->mm_ops); kvm_clear_pte(ctx->ptep); /* @@ -790,11 +790,11 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - kvm_pte_t *childp, pte = *ctx->ptep; + kvm_pte_t *childp; int ret; if (data->anchor) { - if (stage2_pte_is_counted(pte)) + if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); return 0; @@ -819,7 +819,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (stage2_pte_is_counted(pte)) + if (stage2_pte_is_counted(ctx->old)) stage2_put_pte(ctx, data->mmu, mm_ops); kvm_set_table_pte(ctx->ptep, childp, mm_ops); @@ -844,7 +844,7 @@ static int stage2_map_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx, data->childp = NULL; ret = stage2_map_walk_leaf(ctx, data); } else { - childp = kvm_pte_follow(*ctx->ptep, mm_ops); + childp = kvm_pte_follow(ctx->old, mm_ops); } mm_ops->put_page(childp); @@ -954,23 +954,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ctx->ptep, *childp = NULL; + kvm_pte_t *childp = NULL; bool need_flush = false; - if (!kvm_pte_valid(pte)) { - if (stage2_pte_is_counted(pte)) { + if (!kvm_pte_valid(ctx->old)) { + if (stage2_pte_is_counted(ctx->old)) { kvm_clear_pte(ctx->ptep); mm_ops->put_page(ctx->ptep); } return 0; } - if (kvm_pte_table(pte, ctx->level)) { - childp = kvm_pte_follow(pte, mm_ops); + if (kvm_pte_table(ctx->old, ctx->level)) { + childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; - } else if (stage2_pte_cacheable(pgt, pte)) { + } else if (stage2_pte_cacheable(pgt, ctx->old)) { need_flush = !stage2_has_fwb(pgt); } @@ -982,7 +982,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, stage2_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); if (childp) @@ -1013,11 +1013,11 @@ struct stage2_attr_data { static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - kvm_pte_t pte = *ctx->ptep; + kvm_pte_t pte = ctx->old; struct stage2_attr_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(ctx->old)) return 0; data->level = ctx->level; @@ -1036,7 +1036,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, * stage-2 PTE if we are going to add executable permission. */ if (mm_ops->icache_inval_pou && - stage2_pte_executable(pte) && !stage2_pte_executable(*ctx->ptep)) + stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), kvm_granule_size(ctx->level)); WRITE_ONCE(*ctx->ptep, pte); @@ -1142,13 +1142,12 @@ static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, { struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; - kvm_pte_t pte = *ctx->ptep; - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) + if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); return 0; } @@ -1200,15 +1199,14 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; - kvm_pte_t pte = *ctx->ptep; - if (!stage2_pte_is_counted(pte)) + if (!stage2_pte_is_counted(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); - if (kvm_pte_table(pte, ctx->level)) - mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); + if (kvm_pte_table(ctx->old, ctx->level)) + mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } -- GitLab From 2a611c7f87f26cca405da63a57f06d0e4dc14240 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:33 +0000 Subject: [PATCH 0476/1423] KVM: arm64: Pass mm_ops through the visitor context As a prerequisite for getting visitors off of struct kvm_pgtable, pass mm_ops through the visitor context. No functional change intended. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Reviewed-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-4-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 1 + arch/arm64/kvm/hyp/nvhe/setup.c | 3 +- arch/arm64/kvm/hyp/pgtable.c | 63 +++++++++++----------------- 3 files changed, 26 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 14d4b68a1e92c..a752793482cb0 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -203,6 +203,7 @@ struct kvm_pgtable_visit_ctx { kvm_pte_t *ptep; kvm_pte_t old; void *arg; + struct kvm_pgtable_mm_ops *mm_ops; u64 addr; u64 end; u32 level; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 6af443c9d78e5..1068338d77f3c 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -189,7 +189,7 @@ static void hpool_put_page(void *addr) static int finalize_host_mappings_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; enum kvm_pgtable_prot prot; enum pkvm_page_state state; phys_addr_t phys; @@ -239,7 +239,6 @@ static int finalize_host_mappings(void) struct kvm_pgtable_walker walker = { .cb = finalize_host_mappings_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pkvm_pgtable.mm_ops, }; int i, ret; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index fb3696b3a9976..db25e81a9890e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -181,9 +181,10 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - kvm_pte_t *pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, + struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *ptep, u32 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; @@ -191,6 +192,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, .ptep = ptep, .old = READ_ONCE(*ptep), .arg = data->walker->arg, + .mm_ops = mm_ops, .addr = data->addr, .end = data->end, .level = level, @@ -218,8 +220,8 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; } - childp = kvm_pte_follow(ctx.old, data->pgt->mm_ops); - ret = __kvm_pgtable_walk(data, childp, level + 1); + childp = kvm_pte_follow(ctx.old, mm_ops); + ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); if (ret) goto out; @@ -231,7 +233,7 @@ out: } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - kvm_pte_t *pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level) { u32 idx; int ret = 0; @@ -245,7 +247,7 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, if (data->addr >= data->end) break; - ret = __kvm_pgtable_visit(data, ptep, level); + ret = __kvm_pgtable_visit(data, mm_ops, ptep, level); if (ret) break; } @@ -269,7 +271,7 @@ static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; - ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); + ret = __kvm_pgtable_walk(data, pgt->mm_ops, ptep, pgt->start_level); if (ret) break; } @@ -332,7 +334,6 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, struct hyp_map_data { u64 phys; kvm_pte_t attr; - struct kvm_pgtable_mm_ops *mm_ops; }; static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) @@ -400,7 +401,7 @@ static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (ctx->old == new) return true; if (!kvm_pte_valid(ctx->old)) - data->mm_ops->get_page(ctx->ptep); + ctx->mm_ops->get_page(ctx->ptep); else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; @@ -413,7 +414,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, { kvm_pte_t *childp; struct hyp_map_data *data = ctx->arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (hyp_map_walker_try_leaf(ctx, data)) return 0; @@ -436,7 +437,6 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), - .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, @@ -454,18 +454,13 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, return ret; } -struct hyp_unmap_data { - u64 unmapped; - struct kvm_pgtable_mm_ops *mm_ops; -}; - static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp = NULL; u64 granule = kvm_granule_size(ctx->level); - struct hyp_unmap_data *data = ctx->arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + u64 *unmapped = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return -EINVAL; @@ -486,7 +481,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); - data->unmapped += granule; + *unmapped += granule; } dsb(ish); @@ -501,12 +496,10 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { - struct hyp_unmap_data unmap_data = { - .mm_ops = pgt->mm_ops, - }; + u64 unmapped = 0; struct kvm_pgtable_walker walker = { .cb = hyp_unmap_walker, - .arg = &unmap_data, + .arg = &unmapped, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; @@ -514,7 +507,7 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) return 0; kvm_pgtable_walk(pgt, addr, size, &walker); - return unmap_data.unmapped; + return unmapped; } int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, @@ -538,7 +531,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return 0; @@ -556,7 +549,6 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); @@ -575,8 +567,6 @@ struct stage2_map_data { struct kvm_s2_mmu *mmu; void *memcache; - struct kvm_pgtable_mm_ops *mm_ops; - /* Force mappings to page granularity */ bool force_pte; }; @@ -725,7 +715,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new; u64 granule = kvm_granule_size(ctx->level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; @@ -773,7 +763,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - data->childp = kvm_pte_follow(ctx->old, data->mm_ops); + data->childp = kvm_pte_follow(ctx->old, ctx->mm_ops); kvm_clear_pte(ctx->ptep); /* @@ -789,7 +779,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp; int ret; @@ -831,7 +821,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_map_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp; int ret = 0; @@ -898,7 +888,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, - .mm_ops = pgt->mm_ops, .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { @@ -929,7 +918,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, - .mm_ops = pgt->mm_ops, .owner_id = owner_id, .force_pte = true, }; @@ -953,7 +941,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, { struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; - struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp = NULL; bool need_flush = false; @@ -1007,7 +995,6 @@ struct stage2_attr_data { kvm_pte_t attr_clr; kvm_pte_t pte; u32 level; - struct kvm_pgtable_mm_ops *mm_ops; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -1015,7 +1002,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, { kvm_pte_t pte = ctx->old; struct stage2_attr_data *data = ctx->arg; - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return 0; @@ -1055,7 +1042,6 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, struct stage2_attr_data data = { .attr_set = attr_set & attr_mask, .attr_clr = attr_clr & attr_mask, - .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = stage2_attr_walker, @@ -1198,7 +1184,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct kvm_pgtable_mm_ops *mm_ops = ctx->arg; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_pte_is_counted(ctx->old)) return 0; @@ -1218,7 +1204,6 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); -- GitLab From fa002e8e79b3f980455ba585c1f47b26680de5b9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:34 +0000 Subject: [PATCH 0477/1423] KVM: arm64: Don't pass kvm_pgtable through kvm_pgtable_walk_data In order to tear down page tables from outside the context of kvm_pgtable (such as an RCU callback), stop passing a pointer through kvm_pgtable_walk_data. No functional change intended. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Reviewed-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-5-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index db25e81a9890e..93989b750a263 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -50,7 +50,6 @@ #define KVM_MAX_OWNER_ID 1 struct kvm_pgtable_walk_data { - struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; u64 addr; @@ -88,7 +87,7 @@ static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) return (data->addr >> shift) & mask; } -static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) { u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ u64 mask = BIT(pgt->ia_bits) - 1; @@ -96,11 +95,6 @@ static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data) -{ - return __kvm_pgd_page_idx(data->pgt, data->addr); -} - static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) { struct kvm_pgtable pgt = { @@ -108,7 +102,7 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) .start_level = start_level, }; - return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; + return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } static bool kvm_pte_table(kvm_pte_t pte, u32 level) @@ -255,11 +249,10 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, return ret; } -static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) +static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) { u32 idx; int ret = 0; - struct kvm_pgtable *pgt = data->pgt; u64 limit = BIT(pgt->ia_bits); if (data->addr > limit || data->end > limit) @@ -268,7 +261,7 @@ static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) if (!pgt->pgd) return -EINVAL; - for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { + for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; ret = __kvm_pgtable_walk(data, pgt->mm_ops, ptep, pgt->start_level); @@ -283,13 +276,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { - .pgt = pgt, .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; - return _kvm_pgtable_walk(&walk_data); + return _kvm_pgtable_walk(pgt, &walk_data); } struct leaf_walk_data { -- GitLab From 8e94e1252cc054bb31fd3e9a15235cd831970ec1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:35 +0000 Subject: [PATCH 0478/1423] KVM: arm64: Add a helper to tear down unlinked stage-2 subtrees A subsequent change to KVM will move the tear down of an unlinked stage-2 subtree out of the critical path of the break-before-make sequence. Introduce a new helper for tearing down unlinked stage-2 subtrees. Leverage the existing stage-2 free walkers to do so, with a deep call into __kvm_pgtable_walk() as the subtree is no longer reachable from the root. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-6-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++ arch/arm64/kvm/hyp/pgtable.c | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index a752793482cb0..93b1feeaebabb 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -333,6 +333,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +/** + * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure. + * @mm_ops: Memory management callbacks. + * @pgtable: Unlinked stage-2 paging structure to be freed. + * @level: Level of the stage-2 paging structure to be freed. + * + * The page-table is assumed to be unreachable by any hardware walkers prior to + * freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); + /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 93989b750a263..363a5cce7e1a1 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1203,3 +1203,26 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } + +void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +{ + kvm_pte_t *ptep = (kvm_pte_t *)pgtable; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + struct kvm_pgtable_walk_data data = { + .walker = &walker, + + /* + * At this point the IPA really doesn't matter, as the page + * table being traversed has already been removed from the stage + * 2. Set an appropriate range to cover the entire page table. + */ + .addr = 0, + .end = kvm_granule_size(level), + }; + + WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level)); +} -- GitLab From 6b91b8f95cadd3441c056182daf9024475ac4a91 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:36 +0000 Subject: [PATCH 0479/1423] KVM: arm64: Use an opaque type for pteps Use an opaque type for pteps and require visitors explicitly dereference the pointer before using. Protecting page table memory with RCU requires that KVM dereferences RCU-annotated pointers before using. However, RCU is not available for use in the nVHE hypervisor and the opaque type can be conditionally annotated with RCU for the stage-2 MMU. Call the type a 'pteref' to avoid a naming collision with raw pteps. No functional change intended. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-7-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 9 ++++++++- arch/arm64/kvm/hyp/pgtable.c | 27 ++++++++++++++------------- arch/arm64/kvm/mmu.c | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 93b1feeaebabb..cbd2851eefc1c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -37,6 +37,13 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +typedef kvm_pte_t *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared) +{ + return pteref; +} + #define KVM_PTE_VALID BIT(0) #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) @@ -175,7 +182,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, struct kvm_pgtable { u32 ia_bits; u32 start_level; - kvm_pte_t *pgd; + kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; /* Stage-2 only */ diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 363a5cce7e1a1..7511494537e5b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -175,13 +175,14 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, - kvm_pte_t *ptep, u32 level) + kvm_pteref_t pteref, u32 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; + kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false); struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, .old = READ_ONCE(*ptep), @@ -193,7 +194,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, .flags = flags, }; int ret = 0; - kvm_pte_t *childp; + kvm_pteref_t childp; bool table = kvm_pte_table(ctx.old, level); if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) @@ -214,7 +215,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; } - childp = kvm_pte_follow(ctx.old, mm_ops); + childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); if (ret) goto out; @@ -227,7 +228,7 @@ out: } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pte_t *pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) { u32 idx; int ret = 0; @@ -236,12 +237,12 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { - kvm_pte_t *ptep = &pgtable[idx]; + kvm_pteref_t pteref = &pgtable[idx]; if (data->addr >= data->end) break; - ret = __kvm_pgtable_visit(data, mm_ops, ptep, level); + ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); if (ret) break; } @@ -262,9 +263,9 @@ static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_da return -EINVAL; for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { - kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; - ret = __kvm_pgtable_walk(data, pgt->mm_ops, ptep, pgt->start_level); + ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); if (ret) break; } @@ -507,7 +508,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, { u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); - pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL); + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; @@ -544,7 +545,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); - pgt->mm_ops->put_page(pgt->pgd); + pgt->mm_ops->put_page(kvm_dereference_pteref(pgt->pgd, false)); pgt->pgd = NULL; } @@ -1157,7 +1158,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; - pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz); + pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); if (!pgt->pgd) return -ENOMEM; @@ -1200,7 +1201,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(pgt->pgd, false), pgd_sz); pgt->pgd = NULL; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8c..5e197ae190ef9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -640,7 +640,7 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { static int get_user_mapping_size(struct kvm *kvm, u64 addr) { struct kvm_pgtable pgt = { - .pgd = (kvm_pte_t *)kvm->mm->pgd, + .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = VA_BITS, .start_level = (KVM_PGTABLE_MAX_LEVELS - CONFIG_PGTABLE_LEVELS), -- GitLab From 5c359cca1faf6d7671537fe1c240e8668467864d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:37 +0000 Subject: [PATCH 0480/1423] KVM: arm64: Tear down unlinked stage-2 subtree after break-before-make The break-before-make sequence is a bit annoying as it opens a window wherein memory is unmapped from the guest. KVM should replace the PTE as quickly as possible and avoid unnecessary work in between. Presently, the stage-2 map walker tears down a removed table before installing a block mapping when coalescing a table into a block. As the removed table is no longer visible to hardware walkers after the DSB+TLBI, it is possible to move the remaining cleanup to happen after installing the new PTE. Reshuffle the stage-2 map walker to install the new block entry in the pre-order callback. Unwire all of the teardown logic and replace it with a call to kvm_pgtable_stage2_free_removed() after fixing the PTE. The post-order visitor is now completely unnecessary, so drop it. Finally, touch up the comments to better represent the now simplified map walker. Note that the call to tear down the unlinked stage-2 is indirected as a subsequent change will use an RCU callback to trigger tear down. RCU is not available to pKVM, so there is a need to use different implementations on pKVM and non-pKVM VMs. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-8-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 3 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 ++ arch/arm64/kvm/hyp/pgtable.c | 85 +++++++-------------------- arch/arm64/kvm/mmu.c | 8 +++ 4 files changed, 39 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index cbd2851eefc1c..e70cf57b719ec 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -92,6 +92,8 @@ static inline bool kvm_level_supports_block_mapping(u32 level) * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. + * @free_removed_table: Free a removed paging structure by unlinking and + * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the * refcount reaches 0 the page is automatically @@ -110,6 +112,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); + void (*free_removed_table)(void *addr, u32 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d21d1b08a055b..735769886b55c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -79,6 +79,11 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } +static void host_s2_free_removed_table(void *addr, u32 level) +{ + kvm_pgtable_stage2_free_removed(&host_kvm.mm_ops, addr, level); +} + static int prepare_s2_pool(void *pgt_pool_base) { unsigned long nr_pages, pfn; @@ -93,6 +98,7 @@ static int prepare_s2_pool(void *pgt_pool_base) host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, + .free_removed_table = host_s2_free_removed_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 7511494537e5b..7c9782347570b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -750,13 +750,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { - if (data->anchor) - return 0; + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + int ret; if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - data->childp = kvm_pte_follow(ctx->old, ctx->mm_ops); kvm_clear_pte(ctx->ptep); /* @@ -765,8 +765,13 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, * individually. */ kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu); - data->anchor = ctx->ptep; - return 0; + + ret = stage2_map_walker_try_leaf(ctx, data); + + mm_ops->put_page(ctx->ptep); + mm_ops->free_removed_table(childp, ctx->level); + + return ret; } static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, @@ -776,13 +781,6 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t *childp; int ret; - if (data->anchor) { - if (stage2_pte_is_counted(ctx->old)) - mm_ops->put_page(ctx->ptep); - - return 0; - } - ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret; @@ -811,49 +809,14 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, return 0; } -static int stage2_map_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx, - struct stage2_map_data *data) -{ - struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - kvm_pte_t *childp; - int ret = 0; - - if (!data->anchor) - return 0; - - if (data->anchor == ctx->ptep) { - childp = data->childp; - data->anchor = NULL; - data->childp = NULL; - ret = stage2_map_walk_leaf(ctx, data); - } else { - childp = kvm_pte_follow(ctx->old, mm_ops); - } - - mm_ops->put_page(childp); - mm_ops->put_page(ctx->ptep); - - return ret; -} - /* - * This is a little fiddly, as we use all three of the walk flags. The idea - * is that the TABLE_PRE callback runs for table entries on the way down, - * looking for table entries which we could conceivably replace with a - * block entry for this mapping. If it finds one, then it sets the 'anchor' - * field in 'struct stage2_map_data' to point at the table entry, before - * clearing the entry to zero and descending into the now detached table. - * - * The behaviour of the LEAF callback then depends on whether or not the - * anchor has been set. If not, then we're not using a block mapping higher - * up the table and we perform the mapping at the existing leaves instead. - * If, on the other hand, the anchor _is_ set, then we drop references to - * all valid leaves so that the pages beneath the anchor can be freed. + * The TABLE_PRE callback runs for table entries on the way down, looking + * for table entries which we could conceivably replace with a block entry + * for this mapping. If it finds one it replaces the entry and calls + * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. * - * Finally, the TABLE_POST callback does nothing if the anchor has not - * been set, but otherwise frees the page-table pages while walking back up - * the page-table, installing the block entry when it revisits the anchor - * pointer and clearing the anchor to NULL. + * Otherwise, the LEAF callback performs the mapping at the existing leaves + * instead. */ static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) @@ -865,11 +828,9 @@ static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: return stage2_map_walk_leaf(ctx, data); - case KVM_PGTABLE_WALK_TABLE_POST: - return stage2_map_walk_table_post(ctx, data); + default: + return -EINVAL; } - - return -EINVAL; } int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -886,8 +847,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_TABLE_PRE | - KVM_PGTABLE_WALK_LEAF | - KVM_PGTABLE_WALK_TABLE_POST, + KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; @@ -917,8 +877,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_TABLE_PRE | - KVM_PGTABLE_WALK_LEAF | - KVM_PGTABLE_WALK_TABLE_POST, + KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; @@ -1207,7 +1166,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) { - kvm_pte_t *ptep = (kvm_pte_t *)pgtable; + kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | @@ -1225,5 +1184,5 @@ void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pg .end = kvm_granule_size(level), }; - WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level)); + WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 5e197ae190ef9..73ae908eb5d93 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -128,6 +128,13 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) free_pages_exact(virt, size); } +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; + +static void stage2_free_removed_table(void *addr, u32 level) +{ + kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, addr, level); +} + static void kvm_host_get_page(void *addr) { get_page(virt_to_page(addr)); @@ -662,6 +669,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, + .free_removed_table = stage2_free_removed_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, -- GitLab From c3119ae45dfb6038ca458ab5ba7a9fba2810845b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:38 +0000 Subject: [PATCH 0481/1423] KVM: arm64: Protect stage-2 traversal with RCU Use RCU to safely walk the stage-2 page tables in parallel. Acquire and release the RCU read lock when traversing the page tables. Defer the freeing of table memory to an RCU callback. Indirect the calls into RCU and provide stubs for hypervisor code, as RCU is not available in such a context. The RCU protection doesn't amount to much at the moment, as readers are already protected by the read-write lock (all walkers that free table memory take the write lock). Nonetheless, a subsequent change will futher relax the locking requirements around the stage-2 MMU, thereby depending on RCU. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-9-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 49 ++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 10 +++++- arch/arm64/kvm/mmu.c | 14 +++++++- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index e70cf57b719ec..7634b6964779a 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -37,6 +37,13 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +/* + * RCU cannot be used in a non-kernel context such as the hyp. As such, page + * table walkers used in hyp do not call into RCU and instead use other + * synchronization mechanisms (such as a spinlock). + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + typedef kvm_pte_t *kvm_pteref_t; static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared) @@ -44,6 +51,40 @@ static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared return pteref; } +static inline void kvm_pgtable_walk_begin(void) {} +static inline void kvm_pgtable_walk_end(void) {} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return true; +} + +#else + +typedef kvm_pte_t __rcu *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared) +{ + return rcu_dereference_check(pteref, !shared); +} + +static inline void kvm_pgtable_walk_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_pgtable_walk_end(void) +{ + rcu_read_unlock(); +} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return rcu_read_lock_held(); +} + +#endif + #define KVM_PTE_VALID BIT(0) #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) @@ -202,11 +243,14 @@ struct kvm_pgtable { * children. * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their * children. + * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared + * with other software walkers. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), KVM_PGTABLE_WALK_TABLE_POST = BIT(2), + KVM_PGTABLE_WALK_SHARED = BIT(3), }; struct kvm_pgtable_visit_ctx { @@ -223,6 +267,11 @@ struct kvm_pgtable_visit_ctx { typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit); +static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx) +{ + return ctx->flags & KVM_PGTABLE_WALK_SHARED; +} + /** * struct kvm_pgtable_walker - Hook into a page-table walk. * @cb: Callback function to invoke during the walk. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 7c9782347570b..d8d963521d4eb 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -171,6 +171,9 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_walker *walker = data->walker; + + /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ + WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); return walker->cb(ctx, visit); } @@ -281,8 +284,13 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; + int r; + + kvm_pgtable_walk_begin(); + r = _kvm_pgtable_walk(pgt, &walk_data); + kvm_pgtable_walk_end(); - return _kvm_pgtable_walk(pgt, &walk_data); + return r; } struct leaf_walk_data { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 73ae908eb5d93..52e042399ba5d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -130,9 +130,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +{ + struct page *page = container_of(head, struct page, rcu_head); + void *pgtable = page_to_virt(page); + u32 level = page_private(page); + + kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); +} + static void stage2_free_removed_table(void *addr, u32 level) { - kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, addr, level); + struct page *page = virt_to_page(addr); + + set_page_private(page, (unsigned long)level); + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); } static void kvm_host_get_page(void *addr) -- GitLab From ca5de2448c3b4c018fe3d6223df8b59068be1cd7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:39 +0000 Subject: [PATCH 0482/1423] KVM: arm64: Atomically update stage 2 leaf attributes in parallel walks The stage2 attr walker is already used for parallel walks. Since commit f783ef1c0e82 ("KVM: arm64: Add fast path to handle permission relaxation during dirty logging"), KVM acquires the read lock when write-unprotecting a PTE. However, the walker only uses a simple store to update the PTE. This is safe as the only possible race is with hardware updates to the access flag, which is benign. However, a subsequent change to KVM will allow more changes to the stage 2 page tables to be done in parallel. Prepare the stage 2 attribute walker by performing atomic updates to the PTE when walking in parallel. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-10-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index d8d963521d4eb..a34e2050f9314 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -185,7 +185,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, kvm_pteref_t pteref, u32 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; - kvm_pte_t *ptep = kvm_dereference_pteref(pteref, false); + kvm_pte_t *ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED); struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, .old = READ_ONCE(*ptep), @@ -675,6 +675,16 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) return !!pte; } +static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + if (!kvm_pgtable_walk_shared(ctx)) { + WRITE_ONCE(*ctx->ptep, new); + return true; + } + + return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; +} + static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { @@ -986,7 +996,9 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), kvm_granule_size(ctx->level)); - WRITE_ONCE(*ctx->ptep, pte); + + if (!stage2_try_set_pte(ctx, pte)) + return -EAGAIN; } return 0; @@ -995,7 +1007,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level) + u32 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -1006,7 +1018,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, struct kvm_pgtable_walker walker = { .cb = stage2_attr_walker, .arg = &data, - .flags = KVM_PGTABLE_WALK_LEAF, + .flags = flags | KVM_PGTABLE_WALK_LEAF, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); @@ -1025,14 +1037,14 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, - NULL, NULL); + NULL, NULL, 0); } kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL); + &pte, NULL, 0); dsb(ishst); return pte; } @@ -1041,7 +1053,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, - &pte, NULL); + &pte, NULL, 0); /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. @@ -1054,7 +1066,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL); + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0); return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; } @@ -1077,7 +1089,8 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, + KVM_PGTABLE_WALK_SHARED); if (!ret) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); return ret; -- GitLab From 331aa3a0547d1c794587e0df374d13b16645e832 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:56:40 +0000 Subject: [PATCH 0483/1423] KVM: arm64: Split init and set for table PTE Create a helper to initialize a table and directly call smp_store_release() to install it (for now). Prepare for a subsequent change that generalizes PTE writes with a helper. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215644.1895162-11-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a34e2050f9314..f4dd77c6c97d2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -136,16 +136,13 @@ static void kvm_clear_pte(kvm_pte_t *ptep) WRITE_ONCE(*ptep, 0); } -static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp, - struct kvm_pgtable_mm_ops *mm_ops) +static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) { - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); + kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); pte |= KVM_PTE_VALID; - - WARN_ON(kvm_pte_valid(old)); - smp_store_release(ptep, pte); + return pte; } static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) @@ -413,7 +410,7 @@ static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - kvm_pte_t *childp; + kvm_pte_t *childp, new; struct hyp_map_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; @@ -427,8 +424,10 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!childp) return -ENOMEM; - kvm_set_table_pte(ctx->ptep, childp, mm_ops); + new = kvm_init_table_pte(childp, mm_ops); mm_ops->get_page(ctx->ptep); + smp_store_release(ctx->ptep, new); + return 0; } @@ -796,7 +795,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - kvm_pte_t *childp; + kvm_pte_t *childp, new; int ret; ret = stage2_map_walker_try_leaf(ctx, data); @@ -821,8 +820,9 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (stage2_pte_is_counted(ctx->old)) stage2_put_pte(ctx, data->mmu, mm_ops); - kvm_set_table_pte(ctx->ptep, childp, mm_ops); + new = kvm_init_table_pte(childp, mm_ops); mm_ops->get_page(ctx->ptep); + smp_store_release(ctx->ptep, new); return 0; } -- GitLab From 0ab12f3574db6cb432917a667f9392a88e8f0dfc Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:58:55 +0000 Subject: [PATCH 0484/1423] KVM: arm64: Make block->table PTE changes parallel-aware In order to service stage-2 faults in parallel, stage-2 table walkers must take exclusive ownership of the PTE being worked on. An additional requirement of the architecture is that software must perform a 'break-before-make' operation when changing the block size used for mapping memory. Roll these two concepts together into helpers for performing a 'break-before-make' sequence. Use a special PTE value to indicate a PTE has been locked by a software walker. Additionally, use an atomic compare-exchange to 'break' the PTE when the stage-2 page tables are possibly shared with another software walker. Elide the DSB + TLBI if the evicted PTE was invalid (and thus not subject to break-before-make). All of the atomics do nothing for now, as the stage-2 walker isn't fully ready to perform parallel walks. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215855.1895367-1-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 80 +++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index f4dd77c6c97d2..b9f0d792b8d94 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -49,6 +49,12 @@ #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -674,6 +680,11 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) return !!pte; } +static bool stage2_pte_is_locked(kvm_pte_t pte) +{ + return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); +} + static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { if (!kvm_pgtable_walk_shared(ctx)) { @@ -684,6 +695,64 @@ static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_ return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; } +/** + * stage2_try_break_pte() - Invalidates a pte according to the + * 'break-before-make' requirements of the + * architecture. + * + * @ctx: context of the visited pte. + * @mmu: stage-2 mmu + * + * Returns: true if the pte was successfully broken. + * + * If the removed pte was valid, performs the necessary serialization and TLB + * invalidation for the old value. For counted ptes, drops the reference count + * on the containing table page. + */ +static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, + struct kvm_s2_mmu *mmu) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + if (stage2_pte_is_locked(ctx->old)) { + /* + * Should never occur if this walker has exclusive access to the + * page tables. + */ + WARN_ON(!kvm_pgtable_walk_shared(ctx)); + return false; + } + + if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) + return false; + + /* + * Perform the appropriate TLB invalidation based on the evicted pte + * value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + else if (kvm_pte_valid(ctx->old)) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + + if (stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + + return true; +} + +static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + + WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); + + if (stage2_pte_is_counted(new)) + mm_ops->get_page(ctx->ptep); + + smp_store_release(ctx->ptep, new); +} + static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { @@ -812,17 +881,18 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!childp) return -ENOMEM; + if (!stage2_try_break_pte(ctx, data->mmu)) { + mm_ops->put_page(childp); + return -EAGAIN; + } + /* * If we've run into an existing block mapping then replace it with * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (stage2_pte_is_counted(ctx->old)) - stage2_put_pte(ctx, data->mmu, mm_ops); - new = kvm_init_table_pte(childp, mm_ops); - mm_ops->get_page(ctx->ptep); - smp_store_release(ctx->ptep, new); + stage2_make_pte(ctx, new); return 0; } -- GitLab From 946fbfdf336b811479e024136c7cabc00157b6b9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 21:59:34 +0000 Subject: [PATCH 0485/1423] KVM: arm64: Make leaf->leaf PTE changes parallel-aware Convert stage2_map_walker_try_leaf() to use the new break-before-make helpers, thereby making the handler parallel-aware. As before, avoid the break-before-make if recreating the existing mapping. Additionally, retry execution if another vCPU thread is modifying the same PTE. Signed-off-by: Oliver Upton Reviewed-by: Ben Gardon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107215934.1895478-1-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b9f0d792b8d94..238f29389617b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -804,18 +804,17 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, else new = kvm_init_invalid_leaf_owner(data->owner_id); - if (stage2_pte_is_counted(ctx->old)) { - /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. - */ - if (!stage2_pte_needs_update(ctx->old, new)) - return -EAGAIN; + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!stage2_pte_needs_update(ctx->old, new)) + return -EAGAIN; - stage2_put_pte(ctx, data->mmu, mm_ops); - } + if (!stage2_try_break_pte(ctx, data->mmu)) + return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) @@ -825,9 +824,8 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - smp_store_release(ctx->ptep, new); - if (stage2_pte_is_counted(new)) - mm_ops->get_page(ctx->ptep); + stage2_make_pte(ctx, new); + if (kvm_phys_is_valid(phys)) data->phys += granule; return 0; -- GitLab From af87fc03cfdf6893011df419588d27acdfb9c197 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 22:00:06 +0000 Subject: [PATCH 0486/1423] KVM: arm64: Make table->block changes parallel-aware stage2_map_walker_try_leaf() and friends now handle stage-2 PTEs generically, and perform the correct flush when a table PTE is removed. Additionally, they've been made parallel-aware, using an atomic break to take ownership of the PTE. Stop clearing the PTE in the pre-order callback and instead let stage2_map_walker_try_leaf() deal with it. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107220006.1895572-1-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 238f29389617b..f814422ef7952 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -841,21 +841,12 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; - kvm_clear_pte(ctx->ptep); - - /* - * Invalidate the whole stage-2, as we may have numerous leaf - * entries below us which would otherwise need invalidating - * individually. - */ - kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu); - ret = stage2_map_walker_try_leaf(ctx, data); + if (ret) + return ret; - mm_ops->put_page(ctx->ptep); mm_ops->free_removed_table(childp, ctx->level); - - return ret; + return 0; } static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, -- GitLab From 1577cb5823cefdff4416f272a88143ee933d97f5 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Nov 2022 22:00:33 +0000 Subject: [PATCH 0487/1423] KVM: arm64: Handle stage-2 faults in parallel The stage-2 map walker has been made parallel-aware, and as such can be called while only holding the read side of the MMU lock. Rip out the conditional locking in user_mem_abort() and instead grab the read lock. Continue to take the write lock from other callsites to kvm_pgtable_stage2_map(). Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221107220033.1895655-1-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 3 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- arch/arm64/kvm/hyp/pgtable.c | 5 +++-- arch/arm64/kvm/mmu.c | 31 ++++++--------------------- 4 files changed, 13 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 7634b6964779a..a874ce0ce7b51 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -412,6 +412,7 @@ void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pg * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. + * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page @@ -433,7 +434,7 @@ void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pg */ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - void *mc); + void *mc, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 735769886b55c..f6d82bf33ce1d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -257,7 +257,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, - prot, &host_s2_pool); + prot, &host_s2_pool, 0); } /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index f814422ef7952..5bca9610d040d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -912,7 +912,7 @@ static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - void *mc) + void *mc, enum kvm_pgtable_walk_flags flags) { int ret; struct stage2_map_data map_data = { @@ -923,7 +923,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, - .flags = KVM_PGTABLE_WALK_TABLE_PRE | + .flags = flags | + KVM_PGTABLE_WALK_TABLE_PRE | KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 52e042399ba5d..410c2a37fe32f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -861,7 +861,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, write_lock(&kvm->mmu_lock); ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache); + &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1156,7 +1156,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; @@ -1191,8 +1190,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; - use_read_lock = (fault_status == FSC_PERM && write_fault && - fault_granule == PAGE_SIZE); } else { vma_shift = get_vma_page_shift(vma, hva); } @@ -1291,15 +1288,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - /* - * To reduce MMU contentions and enhance concurrency during dirty - * logging dirty logging, only acquire read lock for permission - * relaxation. - */ - if (use_read_lock) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; @@ -1343,15 +1332,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { + if (fault_status == FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - } else { - WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n"); - + else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache); - } + memcache, KVM_PGTABLE_WALK_SHARED); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { @@ -1360,10 +1346,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (use_read_lock) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; @@ -1569,7 +1552,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) */ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL); + KVM_PGTABLE_PROT_R, NULL, 0); return false; } -- GitLab From fba31beab3578b793060f549188fe682df7d3ed9 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 9 Nov 2022 15:10:39 +0530 Subject: [PATCH 0488/1423] PCI: qcom: Fix error message for reset_control_assert() Fix the error message to mention "assert" instead of "deassert". Link: https://lore.kernel.org/r/20221109094039.25753-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi Reviewed-by: Vinod Koul --- drivers/pci/controller/dwc/pcie-qcom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index f711acacaeaf8..cf27345f65759 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1236,7 +1236,7 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie) ret = reset_control_assert(res->pci_reset); if (ret < 0) { - dev_err(dev, "cannot deassert pci reset\n"); + dev_err(dev, "cannot assert pci reset\n"); goto err_disable_clocks; } -- GitLab From c9bfd858402c86b6559aa05227eb5dbae3ce862e Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Thu, 3 Nov 2022 10:56:54 +0800 Subject: [PATCH 0489/1423] dt-bindings: PCI: mediatek-gen3: Support mt8195 In order to support mt8195 pcie node, update the yaml to support new properties of iommu and power-domain, and update the reset-names property to allow only one 'mac' name. Link: https://lore.kernel.org/r/20221103025656.8714-2-tinghan.shen@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: TingHan Shen Signed-off-by: Lorenzo Pieralisi Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/mediatek-pcie-gen3.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml index c00be39af64e5..bc90f0ec7bd99 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml @@ -70,15 +70,21 @@ properties: minItems: 1 maxItems: 8 + iommu-map: + maxItems: 1 + + iommu-map-mask: + const: 0 + resets: minItems: 1 maxItems: 2 reset-names: minItems: 1 + maxItems: 2 items: - - const: phy - - const: mac + enum: [ phy, mac ] clocks: maxItems: 6 @@ -107,6 +113,9 @@ properties: items: - const: pcie-phy + power-domains: + maxItems: 1 + '#interrupt-cells': const: 1 -- GitLab From 8405d8f0956d227c3355d9bdbabc23f79f721ce4 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Sep 2022 15:42:37 +0530 Subject: [PATCH 0490/1423] PCI: dwc: Use dev_info for PCIe link down event logging Some of the platforms (like Tegra194 and Tegra234) have open slots and not having an endpoint connected to the slot is not an error. So, changing the macro from dev_err to dev_info to log the event. Link: https://lore.kernel.org/r/20220913101237.4337-1-vidyas@nvidia.com Tested-by: Jon Hunter Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Jon Hunter Acked-by: Manivannan Sadhasivam --- drivers/pci/controller/dwc/pcie-designware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 9e4d96e5a3f5a..432aead68d1fd 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -448,7 +448,7 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) } if (retries >= LINK_WAIT_MAX_RETRIES) { - dev_err(pci->dev, "Phy link never came up\n"); + dev_info(pci->dev, "Phy link never came up\n"); return -ETIMEDOUT; } -- GitLab From 72f542ac4f39fb42b8a6380ac8d9b3c39019d2d6 Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Fri, 28 Oct 2022 02:17:16 -0700 Subject: [PATCH 0491/1423] dt-bindings: PCI: Add host mode device-id for j721s2 platform Add unique device-id of 0xb013 for j721s2 platform to oneOf field. Link: https://lore.kernel.org/r/20221028091716.21414-1-mranostay@ti.com Signed-off-by: Matt Ranostay Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml index d9df7cd922f1f..b0513b197d087 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml @@ -73,6 +73,8 @@ properties: - const: 0xb00f - items: - const: 0xb010 + - items: + - const: 0xb013 msi-map: true -- GitLab From 9e6f07cd1eaa72c41719050c5ca9d22a8c0b7c02 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:01 +0100 Subject: [PATCH 0492/1423] vfio/ccw: create a parent struct Move the stuff associated with the mdev parent (and thus the subchannel struct) into its own struct, and leave the rest in the existing private structure. The subchannel will point to the parent, and the parent will point to the private, for the areas where one or both are needed. Further separation of these structs will follow. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-2-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 98 +++++++++++++++++++++++------ drivers/s390/cio/vfio_ccw_ops.c | 8 ++- drivers/s390/cio/vfio_ccw_private.h | 20 ++++-- 3 files changed, 101 insertions(+), 25 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7f5402fe857a2..444b32047397b 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -36,10 +36,19 @@ debug_info_t *vfio_ccw_debug_trace_id; */ int vfio_ccw_sch_quiesce(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); DECLARE_COMPLETION_ONSTACK(completion); int iretry, ret = 0; + /* + * Probably an impossible situation, after being called through + * FSM callbacks. But in the event it did, register a warning + * and return as if things were fine. + */ + if (WARN_ON(!private)) + return 0; + iretry = 255; do { @@ -121,7 +130,23 @@ static void vfio_ccw_crw_todo(struct work_struct *work) */ static void vfio_ccw_sch_irq(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + + /* + * The subchannel should still be disabled at this point, + * so an interrupt would be quite surprising. As with an + * interrupt while the FSM is closed, let's attempt to + * disable the subchannel again. + */ + if (!private) { + VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: unexpected interrupt\n", + sch->schid.cssid, sch->schid.ssid, + sch->schid.sch_no); + + cio_disable_subchannel(sch); + return; + } inc_irq_stat(IRQIO_CIO); vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); @@ -201,10 +226,19 @@ static void vfio_ccw_free_private(struct vfio_ccw_private *private) mutex_destroy(&private->io_mutex); kfree(private); } + +static void vfio_ccw_free_parent(struct device *dev) +{ + struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); + + kfree(parent); +} + static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; struct vfio_ccw_private *private; + struct vfio_ccw_parent *parent; int ret = -ENOMEM; if (pmcw->qf) { @@ -213,38 +247,58 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return -ENODEV; } + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) + return -ENOMEM; + + dev_set_name(&parent->dev, "parent"); + parent->dev.parent = &sch->dev; + parent->dev.release = &vfio_ccw_free_parent; + ret = device_register(&parent->dev); + if (ret) + goto out_free; + private = vfio_ccw_alloc_private(sch); - if (IS_ERR(private)) + if (IS_ERR(private)) { + device_unregister(&parent->dev); return PTR_ERR(private); + } - dev_set_drvdata(&sch->dev, private); + dev_set_drvdata(&sch->dev, parent); + dev_set_drvdata(&parent->dev, private); - private->mdev_type.sysfs_name = "io"; - private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; - private->mdev_types[0] = &private->mdev_type; - ret = mdev_register_parent(&private->parent, &sch->dev, + parent->mdev_type.sysfs_name = "io"; + parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; + parent->mdev_types[0] = &parent->mdev_type; + ret = mdev_register_parent(&parent->parent, &sch->dev, &vfio_ccw_mdev_driver, - private->mdev_types, 1); + parent->mdev_types, 1); if (ret) - goto out_free; + goto out_unreg; VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no); return 0; +out_unreg: + device_unregister(&parent->dev); out_free: + dev_set_drvdata(&parent->dev, NULL); dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_private(private); + if (private) + vfio_ccw_free_private(private); return ret; } static void vfio_ccw_sch_remove(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); - mdev_unregister_parent(&private->parent); + mdev_unregister_parent(&parent->parent); + device_unregister(&parent->dev); dev_set_drvdata(&sch->dev, NULL); vfio_ccw_free_private(private); @@ -256,7 +310,11 @@ static void vfio_ccw_sch_remove(struct subchannel *sch) static void vfio_ccw_sch_shutdown(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + + if (WARN_ON(!private)) + return; vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE); vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); @@ -274,7 +332,8 @@ static void vfio_ccw_sch_shutdown(struct subchannel *sch) */ static int vfio_ccw_sch_event(struct subchannel *sch, int process) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); unsigned long flags; int rc = -EAGAIN; @@ -287,8 +346,10 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process) rc = 0; - if (cio_update_schib(sch)) - vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + if (cio_update_schib(sch)) { + if (private) + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + } out_unlock: spin_unlock_irqrestore(sch->lock, flags); @@ -326,7 +387,8 @@ static void vfio_ccw_queue_crw(struct vfio_ccw_private *private, static int vfio_ccw_chp_event(struct subchannel *sch, struct chp_link *link, int event) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); int mask = chp_ssd_get_mask(&sch->ssd_info, link); int retry = 255; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6ae4d012d8008..dc084883d872d 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -55,7 +55,9 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); + struct subchannel *sch = to_subchannel(mdev->dev.parent); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); int ret; if (private->state == VFIO_CCW_STATE_NOT_OPER) @@ -100,7 +102,9 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) static void vfio_ccw_mdev_remove(struct mdev_device *mdev) { - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); + struct subchannel *sch = to_subchannel(mdev->dev.parent); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n", private->sch->schid.cssid, diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index bd5fb81456af8..1f598d58d9694 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -67,6 +67,21 @@ struct vfio_ccw_crw { struct crw crw; }; +/** + * struct vfio_ccw_parent + * + * @dev: embedded device struct + * @parent: parent data structures for mdevs created + * @mdev_type(s): identifying information for mdevs created + */ +struct vfio_ccw_parent { + struct device dev; + + struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[1]; +}; + /** * struct vfio_ccw_private * @vdev: Embedded VFIO device @@ -89,7 +104,6 @@ struct vfio_ccw_crw { * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling * @release_comp: synchronization helper for vfio device release - * @parent: parent data structures for mdevs created */ struct vfio_ccw_private { struct vfio_device vdev; @@ -116,10 +130,6 @@ struct vfio_ccw_private { struct work_struct crw_work; struct completion release_comp; - - struct mdev_parent parent; - struct mdev_type mdev_type; - struct mdev_type *mdev_types[1]; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); -- GitLab From 008a011d68036ebfa4dede07cb9b93dedaa958b1 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:02 +0100 Subject: [PATCH 0493/1423] vfio/ccw: remove private->sch These places all rely on the ability to jump from a private struct back to the subchannel struct. Rather than keeping a copy in our back pocket, let's use the relationship provided by the vfio_device embedded within the private. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-3-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_chp.c | 5 +++-- drivers/s390/cio/vfio_ccw_drv.c | 3 +-- drivers/s390/cio/vfio_ccw_fsm.c | 27 ++++++++++++--------------- drivers/s390/cio/vfio_ccw_ops.c | 12 ++++++------ drivers/s390/cio/vfio_ccw_private.h | 7 ++++--- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 13b26a1c79886..d3f3a611f95b4 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -16,6 +16,7 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, char __user *buf, size_t count, loff_t *ppos) { + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; struct ccw_schib_region *region; @@ -27,12 +28,12 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, mutex_lock(&private->io_mutex); region = private->region[i].data; - if (cio_update_schib(private->sch)) { + if (cio_update_schib(sch)) { ret = -ENODEV; goto out; } - memcpy(region, &private->sch->schib, sizeof(*region)); + memcpy(region, &sch->schib, sizeof(*region)); if (copy_to_user(buf, (void *)region + pos, count)) { ret = -EFAULT; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 444b32047397b..2c680a5563832 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -160,7 +160,6 @@ static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) if (!private) return ERR_PTR(-ENOMEM); - private->sch = sch; mutex_init(&private->io_mutex); private->state = VFIO_CCW_STATE_STANDBY; INIT_LIST_HEAD(&private->crw); @@ -395,7 +394,7 @@ static int vfio_ccw_chp_event(struct subchannel *sch, if (!private || !mask) return 0; - trace_vfio_ccw_chp_event(private->sch->schid, mask, event); + trace_vfio_ccw_chp_event(sch->schid, mask, event); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no, diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index a59c758869f8f..e67fad897af3a 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -18,15 +18,13 @@ static int fsm_io_helper(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); union orb *orb; int ccode; __u8 lpm; unsigned long flags; int ret; - sch = private->sch; - spin_lock_irqsave(sch->lock, flags); orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm); @@ -80,13 +78,11 @@ out: static int fsm_do_halt(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned long flags; int ccode; int ret; - sch = private->sch; - spin_lock_irqsave(sch->lock, flags); VFIO_CCW_TRACE_EVENT(2, "haltIO"); @@ -121,13 +117,11 @@ static int fsm_do_halt(struct vfio_ccw_private *private) static int fsm_do_clear(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned long flags; int ccode; int ret; - sch = private->sch; - spin_lock_irqsave(sch->lock, flags); VFIO_CCW_TRACE_EVENT(2, "clearIO"); @@ -160,7 +154,7 @@ static int fsm_do_clear(struct vfio_ccw_private *private) static void fsm_notoper(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: notoper event %x state %x\n", sch->schid.cssid, @@ -228,7 +222,7 @@ static void fsm_async_retry(struct vfio_ccw_private *private, static void fsm_disabled_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); /* * An interrupt in a disabled state means a previous disable was not @@ -238,7 +232,9 @@ static void fsm_disabled_irq(struct vfio_ccw_private *private, } inline struct subchannel_id get_schid(struct vfio_ccw_private *p) { - return p->sch->schid; + struct subchannel *sch = to_subchannel(p->vdev.dev->parent); + + return sch->schid; } /* @@ -360,10 +356,11 @@ static void fsm_async_request(struct vfio_ccw_private *private, static void fsm_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); struct irb *irb = this_cpu_ptr(&cio_irb); VFIO_CCW_TRACE_EVENT(6, "IRQ"); - VFIO_CCW_TRACE_EVENT(6, dev_name(&private->sch->dev)); + VFIO_CCW_TRACE_EVENT(6, dev_name(&sch->dev)); memcpy(&private->irb, irb, sizeof(*irb)); @@ -376,7 +373,7 @@ static void fsm_irq(struct vfio_ccw_private *private, static void fsm_open(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); int ret; spin_lock_irq(sch->lock); @@ -397,7 +394,7 @@ err_unlock: static void fsm_close(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); int ret; spin_lock_irq(sch->lock); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index dc084883d872d..79c50cb7dcb81 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -68,9 +68,9 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) return ret; VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", - private->sch->schid.cssid, - private->sch->schid.ssid, - private->sch->schid.sch_no); + sch->schid.cssid, + sch->schid.ssid, + sch->schid.sch_no); ret = vfio_register_emulated_iommu_dev(&private->vdev); if (ret) @@ -107,9 +107,9 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n", - private->sch->schid.cssid, - private->sch->schid.ssid, - private->sch->schid.sch_no); + sch->schid.cssid, + sch->schid.ssid, + sch->schid.sch_no); vfio_unregister_group_dev(&private->vdev); diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 1f598d58d9694..b28af2f639630 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -85,7 +85,6 @@ struct vfio_ccw_parent { /** * struct vfio_ccw_private * @vdev: Embedded VFIO device - * @sch: pointer to the subchannel * @state: internal state of the device * @completion: synchronization helper of the I/O completion * @io_region: MMIO region to input/output I/O arguments/results @@ -107,7 +106,6 @@ struct vfio_ccw_parent { */ struct vfio_ccw_private { struct vfio_device vdev; - struct subchannel *sch; int state; struct completion *completion; struct ccw_io_region *io_region; @@ -172,7 +170,10 @@ extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS]; static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - trace_vfio_ccw_fsm_event(private->sch->schid, private->state, event); + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); + + if (sch) + trace_vfio_ccw_fsm_event(sch->schid, private->state, event); vfio_ccw_jumptable[private->state][event](private, event); } -- GitLab From 06caaa27df8f1a8a6be78212c5ef5cac04500176 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:03 +0100 Subject: [PATCH 0494/1423] vfio/ccw: move private initialization to callback There's already a device initialization callback that is used to initialize the release completion workaround that was introduced by commit ebb72b765fb49 ("vfio/ccw: Use the new device life cycle helpers"). Move the other elements of the vfio_ccw_private struct that require distinct initialization over to that routine. With that done, the vfio_ccw_alloc_private routine only does a kzalloc, so fold it inline. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-4-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 74 ++++------------------------- drivers/s390/cio/vfio_ccw_ops.c | 43 +++++++++++++++++ drivers/s390/cio/vfio_ccw_private.h | 7 ++- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 2c680a5563832..fbc26338ceabe 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -23,10 +23,10 @@ #include "vfio_ccw_private.h" struct workqueue_struct *vfio_ccw_work_q; -static struct kmem_cache *vfio_ccw_io_region; -static struct kmem_cache *vfio_ccw_cmd_region; -static struct kmem_cache *vfio_ccw_schib_region; -static struct kmem_cache *vfio_ccw_crw_region; +struct kmem_cache *vfio_ccw_io_region; +struct kmem_cache *vfio_ccw_cmd_region; +struct kmem_cache *vfio_ccw_schib_region; +struct kmem_cache *vfio_ccw_crw_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -79,7 +79,7 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch) return ret; } -static void vfio_ccw_sch_io_todo(struct work_struct *work) +void vfio_ccw_sch_io_todo(struct work_struct *work) { struct vfio_ccw_private *private; struct irb *irb; @@ -115,7 +115,7 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) eventfd_signal(private->io_trigger, 1); } -static void vfio_ccw_crw_todo(struct work_struct *work) +void vfio_ccw_crw_todo(struct work_struct *work) { struct vfio_ccw_private *private; @@ -152,62 +152,6 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) -{ - struct vfio_ccw_private *private; - - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return ERR_PTR(-ENOMEM); - - mutex_init(&private->io_mutex); - private->state = VFIO_CCW_STATE_STANDBY; - INIT_LIST_HEAD(&private->crw); - INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); - INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - - private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), - GFP_KERNEL); - if (!private->cp.guest_cp) - goto out_free_private; - - private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, - GFP_KERNEL | GFP_DMA); - if (!private->io_region) - goto out_free_cp; - - private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, - GFP_KERNEL | GFP_DMA); - if (!private->cmd_region) - goto out_free_io; - - private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, - GFP_KERNEL | GFP_DMA); - - if (!private->schib_region) - goto out_free_cmd; - - private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, - GFP_KERNEL | GFP_DMA); - - if (!private->crw_region) - goto out_free_schib; - return private; - -out_free_schib: - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); -out_free_cmd: - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); -out_free_io: - kmem_cache_free(vfio_ccw_io_region, private->io_region); -out_free_cp: - kfree(private->cp.guest_cp); -out_free_private: - mutex_destroy(&private->io_mutex); - kfree(private); - return ERR_PTR(-ENOMEM); -} - static void vfio_ccw_free_private(struct vfio_ccw_private *private) { struct vfio_ccw_crw *crw, *temp; @@ -257,10 +201,10 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; - private = vfio_ccw_alloc_private(sch); - if (IS_ERR(private)) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + if (!private) { device_unregister(&parent->dev); - return PTR_ERR(private); + return -ENOMEM; } dev_set_drvdata(&sch->dev, parent); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 79c50cb7dcb81..eb0b8cc210bb8 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -49,8 +49,51 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); + mutex_init(&private->io_mutex); + private->state = VFIO_CCW_STATE_STANDBY; + INIT_LIST_HEAD(&private->crw); + INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); + INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); init_completion(&private->release_comp); + + private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), + GFP_KERNEL); + if (!private->cp.guest_cp) + goto out_free_private; + + private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, + GFP_KERNEL | GFP_DMA); + if (!private->io_region) + goto out_free_cp; + + private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, + GFP_KERNEL | GFP_DMA); + if (!private->cmd_region) + goto out_free_io; + + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, + GFP_KERNEL | GFP_DMA); + if (!private->schib_region) + goto out_free_cmd; + + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, + GFP_KERNEL | GFP_DMA); + if (!private->crw_region) + goto out_free_schib; + return 0; + +out_free_schib: + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); +out_free_cmd: + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); +out_free_io: + kmem_cache_free(vfio_ccw_io_region, private->io_region); +out_free_cp: + kfree(private->cp.guest_cp); +out_free_private: + mutex_destroy(&private->io_mutex); + return -ENOMEM; } static int vfio_ccw_mdev_probe(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index b28af2f639630..55d636225cff8 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -131,6 +131,8 @@ struct vfio_ccw_private { } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); +void vfio_ccw_sch_io_todo(struct work_struct *work); +void vfio_ccw_crw_todo(struct work_struct *work); extern struct mdev_driver vfio_ccw_mdev_driver; @@ -178,7 +180,10 @@ static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, } extern struct workqueue_struct *vfio_ccw_work_q; - +extern struct kmem_cache *vfio_ccw_io_region; +extern struct kmem_cache *vfio_ccw_cmd_region; +extern struct kmem_cache *vfio_ccw_schib_region; +extern struct kmem_cache *vfio_ccw_crw_region; /* s390 debug feature, similar to base cio */ extern debug_info_t *vfio_ccw_debug_msg_id; -- GitLab From 3d62fe18b6a3a93c5360c2d590b9b40b6842133e Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:04 +0100 Subject: [PATCH 0495/1423] vfio/ccw: move private to mdev lifecycle Now that the mdev parent data is split out into its own struct, it is safe to move the remaining private data to follow the mdev probe/remove lifecycle. The mdev parent data will remain where it is, and follow the subchannel and the css driver interfaces. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-5-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 16 +--------------- drivers/s390/cio/vfio_ccw_ops.c | 26 +++++++++++++------------- drivers/s390/cio/vfio_ccw_private.h | 2 ++ 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index fbc26338ceabe..9fbd1b27a1ac1 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -152,7 +152,7 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -static void vfio_ccw_free_private(struct vfio_ccw_private *private) +void vfio_ccw_free_private(struct vfio_ccw_private *private) { struct vfio_ccw_crw *crw, *temp; @@ -180,7 +180,6 @@ static void vfio_ccw_free_parent(struct device *dev) static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; - struct vfio_ccw_private *private; struct vfio_ccw_parent *parent; int ret = -ENOMEM; @@ -201,14 +200,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) { - device_unregister(&parent->dev); - return -ENOMEM; - } - dev_set_drvdata(&sch->dev, parent); - dev_set_drvdata(&parent->dev, private); parent->mdev_type.sysfs_name = "io"; parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; @@ -227,25 +219,19 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) out_unreg: device_unregister(&parent->dev); out_free: - dev_set_drvdata(&parent->dev, NULL); dev_set_drvdata(&sch->dev, NULL); - if (private) - vfio_ccw_free_private(private); return ret; } static void vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); - struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); mdev_unregister_parent(&parent->parent); device_unregister(&parent->dev); dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_private(private); - VFIO_CCW_MSG_EVENT(4, "unbound from subchannel %x.%x.%04x\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index eb0b8cc210bb8..e45d4acb109b0 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -100,15 +100,20 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { struct subchannel *sch = to_subchannel(mdev->dev.parent); struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); - struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + struct vfio_ccw_private *private; int ret; - if (private->state == VFIO_CCW_STATE_NOT_OPER) - return -ENODEV; + private = kzalloc(sizeof(*private), GFP_KERNEL); + if (!private) + return -ENOMEM; ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); - if (ret) + if (ret) { + kfree(private); return ret; + } + + dev_set_drvdata(&parent->dev, private); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", sch->schid.cssid, @@ -122,6 +127,7 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) return 0; err_put_vdev: + dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); return ret; } @@ -131,15 +137,6 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); - /* - * We cannot free vfio_ccw_private here because it includes - * parent info which must be free'ed by css driver. - * - * Use a workaround by memset'ing the core device part and - * then notifying the remove path that all active references - * to this device have been released. - */ - memset(vdev, 0, sizeof(*vdev)); complete(&private->release_comp); } @@ -156,6 +153,7 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&private->vdev); + dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); /* * Wait for all active references on mdev are released so it @@ -166,6 +164,8 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) * cycle. */ wait_for_completion(&private->release_comp); + + vfio_ccw_free_private(private); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 55d636225cff8..747aba5f52723 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -134,6 +134,8 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch); void vfio_ccw_sch_io_todo(struct work_struct *work); void vfio_ccw_crw_todo(struct work_struct *work); +void vfio_ccw_free_private(struct vfio_ccw_private *private); + extern struct mdev_driver vfio_ccw_mdev_driver; /* -- GitLab From f4da83f7e3f096ece936512d86ef3726e470fbfd Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:05 +0100 Subject: [PATCH 0496/1423] vfio/ccw: remove release completion There's enough separation between the parent and private structs now, that it is fine to remove the release completion hack. Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-6-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 14 +------------- drivers/s390/cio/vfio_ccw_private.h | 3 --- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index e45d4acb109b0..8a929a9cf3c66 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -54,7 +54,6 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) INIT_LIST_HEAD(&private->crw); INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - init_completion(&private->release_comp); private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), GFP_KERNEL); @@ -137,7 +136,7 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); - complete(&private->release_comp); + vfio_ccw_free_private(private); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) @@ -155,17 +154,6 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); - /* - * Wait for all active references on mdev are released so it - * is safe to defer kfree() to a later point. - * - * TODO: the clean fix is to split parent/mdev info from ccw - * private structure so each can be managed in its own life - * cycle. - */ - wait_for_completion(&private->release_comp); - - vfio_ccw_free_private(private); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 747aba5f52723..2278fd38d34ea 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -102,7 +102,6 @@ struct vfio_ccw_parent { * @req_trigger: eventfd ctx for signaling userspace to return device * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling - * @release_comp: synchronization helper for vfio device release */ struct vfio_ccw_private { struct vfio_device vdev; @@ -126,8 +125,6 @@ struct vfio_ccw_private { struct eventfd_ctx *req_trigger; struct work_struct io_work; struct work_struct crw_work; - - struct completion release_comp; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); -- GitLab From d1104f9327df9b26901b97cd026949f80ccab0d3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:06 +0100 Subject: [PATCH 0497/1423] vfio/ccw: replace vfio_init_device with _alloc_ Now that we have a reasonable separation of structs that follow the subchannel and mdev lifecycles, there's no reason we can't call the official vfio_alloc_device routine for our private data, and behave like everyone else. Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-7-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 18 ------------------ drivers/s390/cio/vfio_ccw_ops.c | 28 ++++++++++++++++++---------- drivers/s390/cio/vfio_ccw_private.h | 2 -- drivers/vfio/vfio_main.c | 10 +++++----- include/linux/vfio.h | 2 -- 5 files changed, 23 insertions(+), 37 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 9fbd1b27a1ac1..c2a65808605af 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -152,24 +152,6 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -void vfio_ccw_free_private(struct vfio_ccw_private *private) -{ - struct vfio_ccw_crw *crw, *temp; - - list_for_each_entry_safe(crw, temp, &private->crw, next) { - list_del(&crw->next); - kfree(crw); - } - - kmem_cache_free(vfio_ccw_crw_region, private->crw_region); - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - kmem_cache_free(vfio_ccw_io_region, private->io_region); - kfree(private->cp.guest_cp); - mutex_destroy(&private->io_mutex); - kfree(private); -} - static void vfio_ccw_free_parent(struct device *dev) { struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 8a929a9cf3c66..1155f8bcedd99 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -102,15 +102,10 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) struct vfio_ccw_private *private; int ret; - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return -ENOMEM; - - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); - if (ret) { - kfree(private); - return ret; - } + private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev, + &vfio_ccw_dev_ops); + if (IS_ERR(private)) + return PTR_ERR(private); dev_set_drvdata(&parent->dev, private); @@ -135,8 +130,21 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); + struct vfio_ccw_crw *crw, *temp; + + list_for_each_entry_safe(crw, temp, &private->crw, next) { + list_del(&crw->next); + kfree(crw); + } + + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); + mutex_destroy(&private->io_mutex); - vfio_ccw_free_private(private); + vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 2278fd38d34ea..b441ae6700fd2 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -131,8 +131,6 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch); void vfio_ccw_sch_io_todo(struct work_struct *work); void vfio_ccw_crw_todo(struct work_struct *work); -void vfio_ccw_free_private(struct vfio_ccw_private *private); - extern struct mdev_driver vfio_ccw_mdev_driver; /* diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2d168793d4e1c..2901b8ad5be92 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -348,6 +348,9 @@ static void vfio_device_release(struct device *dev) device->ops->release(device); } +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); + /* * Allocate and initialize vfio_device so it can be registered to vfio * core. @@ -386,11 +389,9 @@ EXPORT_SYMBOL_GPL(_vfio_alloc_device); /* * Initialize a vfio_device so it can be registered to vfio core. - * - * Only vfio-ccw driver should call this interface. */ -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) { int ret; @@ -422,7 +423,6 @@ out_uninit: ida_free(&vfio.device_ida, device->index); return ret; } -EXPORT_SYMBOL_GPL(vfio_init_device); /* * The helper called by driver @release callback to free the device diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd1..ba809268a48e0 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,8 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { -- GitLab From 913447d06f032a9e9c84870bec0b1adb8c588f29 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:07 +0100 Subject: [PATCH 0498/1423] vfio: Remove vfio_free_device With the "mess" sorted out, we should be able to inline the vfio_free_device call introduced by commit cb9ff3f3b84c ("vfio: Add helpers for unifying vfio_device life cycle") and remove them from driver release callbacks. Signed-off-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak # vfio-ap part Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-8-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 - drivers/s390/cio/vfio_ccw_ops.c | 2 -- drivers/s390/crypto/vfio_ap_ops.c | 6 ------ drivers/vfio/fsl-mc/vfio_fsl_mc.c | 1 - drivers/vfio/pci/vfio_pci_core.c | 1 - drivers/vfio/platform/vfio_amba.c | 1 - drivers/vfio/platform/vfio_platform.c | 1 - drivers/vfio/vfio_main.c | 22 ++++------------------ include/linux/vfio.h | 1 - samples/vfio-mdev/mbochs.c | 1 - samples/vfio-mdev/mdpy.c | 1 - samples/vfio-mdev/mtty.c | 1 - 12 files changed, 4 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7a45e5360caf2..eee6805e67ded 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1461,7 +1461,6 @@ static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); intel_gvt_destroy_vgpu(vgpu); - vfio_free_device(vfio_dev); } static const struct vfio_device_ops intel_vgpu_dev_ops = { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 1155f8bcedd99..598a3814d4282 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -143,8 +143,6 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) kmem_cache_free(vfio_ccw_io_region, private->io_region); kfree(private->cp.guest_cp); mutex_destroy(&private->io_mutex); - - vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae6..f108c0f147125 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -765,11 +765,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) } } -static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) -{ - vfio_free_device(vdev); -} - static void vfio_ap_mdev_remove(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); @@ -1784,7 +1779,6 @@ static const struct attribute_group vfio_queue_attr_group = { static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .init = vfio_ap_mdev_init_dev, - .release = vfio_ap_mdev_release_dev, .open_device = vfio_ap_mdev_open_device, .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4f..7b8889f550076 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -568,7 +568,6 @@ static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) vfio_fsl_uninit_device(vdev); mutex_destroy(&vdev->igate); - vfio_free_device(core_vdev); } static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac2..9be2d5be5d959 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2109,7 +2109,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); - vfio_free_device(core_vdev); } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c5..18faf2678b998 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -95,7 +95,6 @@ static void vfio_amba_release_dev(struct vfio_device *core_vdev) vfio_platform_release_common(vdev); kfree(vdev->name); - vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd902..9910451dc3415 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -83,7 +83,6 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_platform_device, vdev); vfio_platform_release_common(vdev); - vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2901b8ad5be92..9835757e2bee4 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -339,13 +339,10 @@ static void vfio_device_release(struct device *dev) vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); - /* - * kvfree() cannot be done here due to a life cycle mess in - * vfio-ccw. Before the ccw part is fixed all drivers are - * required to support @release and call vfio_free_device() - * from there. - */ - device->ops->release(device); + if (device->ops->release) + device->ops->release(device); + + kvfree(device); } static int vfio_init_device(struct vfio_device *device, struct device *dev, @@ -424,17 +421,6 @@ out_uninit: return ret; } -/* - * The helper called by driver @release callback to free the device - * structure. Drivers which don't have private data to clean can - * simply use this helper as its @release. - */ -void vfio_free_device(struct vfio_device *device) -{ - kvfree(device); -} -EXPORT_SYMBOL_GPL(vfio_free_device); - static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ba809268a48e0..e7480154825ec 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,7 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { put_device(&device->device); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 117a8d799f711..8b5a3a778a257 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -594,7 +594,6 @@ static void mbochs_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mbochs_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 946e8cfde6fdd..721fb06c6413f 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -283,7 +283,6 @@ static void mdpy_release_dev(struct vfio_device *vdev) vfree(mdev_state->memblk); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mdpy_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e72085fc13763..3c2a421b9b696 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -784,7 +784,6 @@ static void mtty_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->nr_ports, &mdev_avail_ports); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mtty_remove(struct mdev_device *mdev) -- GitLab From 3bfadb2325891d122771ce534336af531e93d7b2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:04 +0800 Subject: [PATCH 0499/1423] KVM: selftests: memslot_perf_test: Use data->nslots in prepare_vm() In prepare_vm(), 'data->nslots' is assigned with 'max_mem_slots - 1' at the beginning, meaning they are interchangeable. Use 'data->nslots' isntead of 'max_mem_slots - 1'. With this, it becomes easier to move the logic of probing number of slots into upper layer in subsequent patches. No functional change intended. Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-2-gshan@redhat.com --- tools/testing/selftests/kvm/memslot_perf_test.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 44995446d942a..231cc8449c2ef 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -280,14 +280,14 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, ucall_init(data->vm, NULL); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", - max_mem_slots - 1, data->pages_per_slot, rempages); + data->nslots, data->pages_per_slot, rempages); clock_gettime(CLOCK_MONOTONIC, &tstart); - for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { + for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) { uint64_t npages; npages = data->pages_per_slot; - if (slot == max_mem_slots - 1) + if (slot == data->nslots) npages += rempages; vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, @@ -297,12 +297,12 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, } *slot_runtime = timespec_elapsed(tstart); - for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) { + for (slot = 0, guest_addr = MEM_GPA; slot < data->nslots; slot++) { uint64_t npages; uint64_t gpa; npages = data->pages_per_slot; - if (slot == max_mem_slots - 2) + if (slot == data->nslots - 1) npages += rempages; gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, -- GitLab From 2aae5e6795e1407334bb849f96f11c9051b959e2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:05 +0800 Subject: [PATCH 0500/1423] KVM: selftests: memslot_perf_test: Consolidate loop conditions in prepare_vm() There are two loops in prepare_vm(), which have different conditions. 'slot' is treated as meory slot index in the first loop, but index of the host virtual address array in the second loop. It makes it a bit hard to understand the code. Change the usage of 'slot' in the second loop, to treat it as the memory slot index either. No functional change intended. Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-3-gshan@redhat.com --- tools/testing/selftests/kvm/memslot_perf_test.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 231cc8449c2ef..dcb492b3f27b2 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -297,21 +297,20 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, } *slot_runtime = timespec_elapsed(tstart); - for (slot = 0, guest_addr = MEM_GPA; slot < data->nslots; slot++) { + for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) { uint64_t npages; uint64_t gpa; npages = data->pages_per_slot; - if (slot == data->nslots - 1) + if (slot == data->nslots) npages += rempages; - gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, - slot + 1); + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, slot); TEST_ASSERT(gpa == guest_addr, "vm_phy_pages_alloc() failed\n"); - data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr); - memset(data->hva_slots[slot], 0, npages * 4096); + data->hva_slots[slot - 1] = addr_gpa2hva(data->vm, guest_addr); + memset(data->hva_slots[slot - 1], 0, npages * 4096); guest_addr += npages * 4096; } -- GitLab From 34396437b11f904fc61b272e3974f4c92868451b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:06 +0800 Subject: [PATCH 0501/1423] KVM: selftests: memslot_perf_test: Probe memory slots for once prepare_vm() is called in every iteration and run. The allowed memory slots (KVM_CAP_NR_MEMSLOTS) are probed for multiple times. It's not free and unnecessary. Move the probing logic for the allowed memory slots to parse_args() for once, which is upper layer of prepare_vm(). No functional change intended. Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-4-gshan@redhat.com --- .../testing/selftests/kvm/memslot_perf_test.c | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index dcb492b3f27b2..f0ea3f75b6e1d 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -245,27 +245,17 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, void *guest_code, uint64_t mempages, struct timespec *slot_runtime) { - uint32_t max_mem_slots; uint64_t rempages; uint64_t guest_addr; uint32_t slot; struct timespec tstart; struct sync_area *sync; - max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); - TEST_ASSERT(max_mem_slots > 1, - "KVM_CAP_NR_MEMSLOTS should be greater than 1"); - TEST_ASSERT(nslots > 1 || nslots == -1, - "Slot count cap should be greater than 1"); - if (nslots != -1) - max_mem_slots = min(max_mem_slots, (uint32_t)nslots); - pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); - TEST_ASSERT(mempages > 1, "Can't test without any memory"); data->npages = mempages; - data->nslots = max_mem_slots - 1; + data->nslots = nslots; data->pages_per_slot = mempages / data->nslots; if (!data->pages_per_slot) { *maxslots = mempages + 1; @@ -869,6 +859,7 @@ static void help(char *name, struct test_args *targs) static bool parse_args(int argc, char *argv[], struct test_args *targs) { + uint32_t max_mem_slots; int opt; while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { @@ -885,8 +876,8 @@ static bool parse_args(int argc, char *argv[], break; case 's': targs->nslots = atoi(optarg); - if (targs->nslots <= 0 && targs->nslots != -1) { - pr_info("Slot count cap has to be positive or -1 for no cap\n"); + if (targs->nslots <= 1 && targs->nslots != -1) { + pr_info("Slot count cap must be larger than 1 or -1 for no cap\n"); return false; } break; @@ -932,6 +923,21 @@ static bool parse_args(int argc, char *argv[], return false; } + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + if (max_mem_slots <= 1) { + pr_info("KVM_CAP_NR_MEMSLOTS should be greater than 1\n"); + return false; + } + + /* Memory slot 0 is reserved */ + if (targs->nslots == -1) + targs->nslots = max_mem_slots - 1; + else + targs->nslots = min_t(int, targs->nslots, max_mem_slots) - 1; + + pr_info_v("Allowed Number of memory slots: %"PRIu32"\n", + targs->nslots + 1); + return true; } -- GitLab From 8675c6f226986ddb67752be22279a0e2385b197e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:07 +0800 Subject: [PATCH 0502/1423] KVM: selftests: memslot_perf_test: Support variable guest page size The test case is obviously broken on aarch64 because non-4KB guest page size is supported. The guest page size on aarch64 could be 4KB, 16KB or 64KB. This supports variable guest page size, mostly for aarch64. - The host determines the guest page size when virtual machine is created. The value is also passed to guest through the synchronization area. - The number of guest pages are unknown until the virtual machine is to be created. So all the related macros are dropped. Instead, their values are dynamically calculated based on the guest page size. - The static checks on memory sizes and pages becomes dependent on guest page size, which is unknown until the virtual machine is about to be created. So all the static checks are converted to dynamic checks, done in check_memory_sizes(). - As the address passed to madvise() should be aligned to host page, the size of page chunk is automatically selected, other than one page. - MEM_TEST_MOVE_SIZE has fixed and non-working 64KB. It will be consolidated in next patch. However, the comments about how it's calculated has been correct. - All other changes included in this patch are almost mechanical replacing '4096' with 'guest_page_size'. Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-5-gshan@redhat.com --- .../testing/selftests/kvm/memslot_perf_test.c | 210 +++++++++++------- 1 file changed, 129 insertions(+), 81 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index f0ea3f75b6e1d..9af61ca8ad0a0 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -26,14 +26,11 @@ #include #define MEM_SIZE ((512U << 20) + 4096) -#define MEM_SIZE_PAGES (MEM_SIZE / 4096) #define MEM_GPA 0x10000000UL #define MEM_AUX_GPA MEM_GPA #define MEM_SYNC_GPA MEM_AUX_GPA #define MEM_TEST_GPA (MEM_AUX_GPA + 4096) #define MEM_TEST_SIZE (MEM_SIZE - 4096) -static_assert(MEM_SIZE % 4096 == 0, "invalid mem size"); -static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); /* * 32 MiB is max size that gets well over 100 iterations on 509 slots. @@ -42,43 +39,37 @@ static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); * limited resolution). */ #define MEM_SIZE_MAP ((32U << 20) + 4096) -#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096) #define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) -#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096) -static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size"); -static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size"); -static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size"); -static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size"); /* * 128 MiB is min size that fills 32k slots with at least one page in each * while at the same time gets 100+ iterations in such test + * + * 2 MiB chunk size like a typical huge page */ #define MEM_TEST_UNMAP_SIZE (128U << 20) -#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096) -/* 2 MiB chunk size like a typical huge page */ -#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12)) -static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE, - "invalid unmap test region size"); -static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0, - "invalid unmap test region size"); -static_assert(MEM_TEST_UNMAP_SIZE_PAGES % - (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0, - "invalid unmap test region size"); +#define MEM_TEST_UNMAP_CHUNK_SIZE (2U << 20) /* * For the move active test the middle of the test area is placed on * a memslot boundary: half lies in the memslot being moved, half in * other memslot(s). * - * When running this test with 32k memslots (32764, really) each memslot - * contains 4 pages. - * The last one additionally contains the remaining 21 pages of memory, - * for the total size of 25 pages. - * Hence, the maximum size here is 50 pages. + * We have different number of memory slots, excluding the reserved + * memory slot 0, on various architectures and configurations. The + * memory size in this test is calculated by picking the maximal + * last memory slot's memory size, with alignment to the largest + * supported page size (64KB). In this way, the selected memory + * size for this test is compatible with test_memslot_move_prepare(). + * + * architecture slots memory-per-slot memory-on-last-slot + * -------------------------------------------------------------- + * x86-4KB 32763 16KB 100KB + * arm64-4KB 32766 16KB 52KB + * arm64-16KB 32766 16KB 48KB + * arm64-64KB 8192 64KB 64KB */ -#define MEM_TEST_MOVE_SIZE_PAGES (50) -#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096) +#define MEM_TEST_MOVE_SIZE 0x10000 #define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, "invalid move test region size"); @@ -100,6 +91,7 @@ struct vm_data { }; struct sync_area { + uint32_t guest_page_size; atomic_bool start_flag; atomic_bool exit_flag; atomic_bool sync_flag; @@ -192,14 +184,15 @@ static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) uint64_t gpage, pgoffs; uint32_t slot, slotoffs; void *base; + uint32_t guest_page_size = data->vm->page_size; TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); - TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096, + TEST_ASSERT(gpa < MEM_GPA + data->npages * guest_page_size, "Too high gpa to translate"); gpa -= MEM_GPA; - gpage = gpa / 4096; - pgoffs = gpa % 4096; + gpage = gpa / guest_page_size; + pgoffs = gpa % guest_page_size; slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1); slotoffs = gpage - (slot * data->pages_per_slot); @@ -217,14 +210,16 @@ static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) } base = data->hva_slots[slot]; - return (uint8_t *)base + slotoffs * 4096 + pgoffs; + return (uint8_t *)base + slotoffs * guest_page_size + pgoffs; } static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) { + uint32_t guest_page_size = data->vm->page_size; + TEST_ASSERT(slot < data->nslots, "Too high slot number"); - return MEM_GPA + slot * data->pages_per_slot * 4096; + return MEM_GPA + slot * data->pages_per_slot * guest_page_size; } static struct vm_data *alloc_vm(void) @@ -242,33 +237,35 @@ static struct vm_data *alloc_vm(void) } static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, - void *guest_code, uint64_t mempages, + void *guest_code, uint64_t mem_size, struct timespec *slot_runtime) { - uint64_t rempages; + uint64_t mempages, rempages; uint64_t guest_addr; - uint32_t slot; + uint32_t slot, guest_page_size; struct timespec tstart; struct sync_area *sync; - TEST_ASSERT(mempages > 1, - "Can't test without any memory"); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + mempages = mem_size / guest_page_size; + + data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); + ucall_init(data->vm, NULL); + TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size"); data->npages = mempages; + TEST_ASSERT(data->npages > 1, "Can't test without any memory"); data->nslots = nslots; - data->pages_per_slot = mempages / data->nslots; + data->pages_per_slot = data->npages / data->nslots; if (!data->pages_per_slot) { - *maxslots = mempages + 1; + *maxslots = data->npages + 1; return false; } - rempages = mempages % data->nslots; + rempages = data->npages % data->nslots; data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); - ucall_init(data->vm, NULL); - pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", data->nslots, data->pages_per_slot, rempages); @@ -283,7 +280,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, guest_addr, slot, npages, 0); - guest_addr += npages * 4096; + guest_addr += npages * guest_page_size; } *slot_runtime = timespec_elapsed(tstart); @@ -300,12 +297,12 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, "vm_phy_pages_alloc() failed\n"); data->hva_slots[slot - 1] = addr_gpa2hva(data->vm, guest_addr); - memset(data->hva_slots[slot - 1], 0, npages * 4096); + memset(data->hva_slots[slot - 1], 0, npages * guest_page_size); - guest_addr += npages * 4096; + guest_addr += npages * guest_page_size; } - virt_map(data->vm, MEM_GPA, MEM_GPA, mempages); + virt_map(data->vm, MEM_GPA, MEM_GPA, data->npages); sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); atomic_init(&sync->start_flag, false); @@ -404,6 +401,7 @@ static bool guest_perform_sync(void) static void guest_code_test_memslot_move(void) { struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); GUEST_SYNC(0); @@ -414,7 +412,7 @@ static void guest_code_test_memslot_move(void) uintptr_t ptr; for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; - ptr += 4096) + ptr += page_size) *(uint64_t *)ptr = MEM_TEST_VAL_1; /* @@ -432,6 +430,7 @@ static void guest_code_test_memslot_move(void) static void guest_code_test_memslot_map(void) { struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); GUEST_SYNC(0); @@ -441,14 +440,16 @@ static void guest_code_test_memslot_map(void) uintptr_t ptr; for (ptr = MEM_TEST_GPA; - ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096) + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr += page_size) *(uint64_t *)ptr = MEM_TEST_VAL_1; if (!guest_perform_sync()) break; for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; - ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096) + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; + ptr += page_size) *(uint64_t *)ptr = MEM_TEST_VAL_2; if (!guest_perform_sync()) @@ -495,6 +496,9 @@ static void guest_code_test_memslot_unmap(void) static void guest_code_test_memslot_rw(void) { + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); + GUEST_SYNC(0); guest_spin_until_start(); @@ -503,14 +507,14 @@ static void guest_code_test_memslot_rw(void) uintptr_t ptr; for (ptr = MEM_TEST_GPA; - ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size) *(uint64_t *)ptr = MEM_TEST_VAL_1; if (!guest_perform_sync()) break; - for (ptr = MEM_TEST_GPA + 4096 / 2; - ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) { + for (ptr = MEM_TEST_GPA + page_size / 2; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size) { uint64_t val = *(uint64_t *)ptr; GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val); @@ -528,6 +532,8 @@ static bool test_memslot_move_prepare(struct vm_data *data, struct sync_area *sync, uint64_t *maxslots, bool isactive) { + uint32_t guest_page_size = data->vm->page_size; + uint64_t move_pages = MEM_TEST_MOVE_SIZE / guest_page_size; uint64_t movesrcgpa, movetestgpa; movesrcgpa = vm_slot2gpa(data, data->nslots - 1); @@ -536,7 +542,7 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint64_t lastpages; vm_gpa2hva(data, movesrcgpa, &lastpages); - if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) { + if (lastpages < move_pages / 2) { *maxslots = 0; return false; } @@ -582,8 +588,9 @@ static void test_memslot_do_unmap(struct vm_data *data, uint64_t offsp, uint64_t count) { uint64_t gpa, ctr; + uint32_t guest_page_size = data->vm->page_size; - for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) { + for (gpa = MEM_TEST_GPA + offsp * guest_page_size, ctr = 0; ctr < count; ) { uint64_t npages; void *hva; int ret; @@ -591,12 +598,12 @@ static void test_memslot_do_unmap(struct vm_data *data, hva = vm_gpa2hva(data, gpa, &npages); TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); npages = min(npages, count - ctr); - ret = madvise(hva, npages * 4096, MADV_DONTNEED); + ret = madvise(hva, npages * guest_page_size, MADV_DONTNEED); TEST_ASSERT(!ret, "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, hva, gpa); ctr += npages; - gpa += npages * 4096; + gpa += npages * guest_page_size; } TEST_ASSERT(ctr == count, "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); @@ -607,11 +614,12 @@ static void test_memslot_map_unmap_check(struct vm_data *data, { uint64_t gpa; uint64_t *val; + uint32_t guest_page_size = data->vm->page_size; if (!map_unmap_verify) return; - gpa = MEM_TEST_GPA + offsp * 4096; + gpa = MEM_TEST_GPA + offsp * guest_page_size; val = (typeof(val))vm_gpa2hva(data, gpa, NULL); TEST_ASSERT(*val == valexp, "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")", @@ -621,12 +629,14 @@ static void test_memslot_map_unmap_check(struct vm_data *data, static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) { + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_pages = MEM_TEST_MAP_SIZE / guest_page_size; + /* * Unmap the second half of the test area while guest writes to (maps) * the first half. */ - test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2, - MEM_TEST_MAP_SIZE_PAGES / 2); + test_memslot_do_unmap(data, guest_pages / 2, guest_pages / 2); /* * Wait for the guest to finish writing the first half of the test @@ -637,10 +647,8 @@ static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) */ host_perform_sync(sync); test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); - test_memslot_map_unmap_check(data, - MEM_TEST_MAP_SIZE_PAGES / 2 - 1, - MEM_TEST_VAL_1); - test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2); + test_memslot_map_unmap_check(data, guest_pages / 2 - 1, MEM_TEST_VAL_1); + test_memslot_do_unmap(data, 0, guest_pages / 2); /* @@ -653,16 +661,16 @@ static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) * the test area. */ host_perform_sync(sync); - test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2, - MEM_TEST_VAL_2); - test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1, - MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, guest_pages - 1, MEM_TEST_VAL_2); } static void test_memslot_unmap_loop_common(struct vm_data *data, struct sync_area *sync, uint64_t chunk) { + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_pages = MEM_TEST_UNMAP_SIZE / guest_page_size; uint64_t ctr; /* @@ -674,42 +682,49 @@ static void test_memslot_unmap_loop_common(struct vm_data *data, */ host_perform_sync(sync); test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); - for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk) + for (ctr = 0; ctr < guest_pages / 2; ctr += chunk) test_memslot_do_unmap(data, ctr, chunk); /* Likewise, but for the opposite host / guest areas */ host_perform_sync(sync); - test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2, - MEM_TEST_VAL_2); - for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2; - ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk) + test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2); + for (ctr = guest_pages / 2; ctr < guest_pages; ctr += chunk) test_memslot_do_unmap(data, ctr, chunk); } static void test_memslot_unmap_loop(struct vm_data *data, struct sync_area *sync) { - test_memslot_unmap_loop_common(data, sync, 1); + uint32_t host_page_size = getpagesize(); + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_chunk_pages = guest_page_size >= host_page_size ? + 1 : host_page_size / guest_page_size; + + test_memslot_unmap_loop_common(data, sync, guest_chunk_pages); } static void test_memslot_unmap_loop_chunked(struct vm_data *data, struct sync_area *sync) { - test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES); + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_chunk_pages = MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size; + + test_memslot_unmap_loop_common(data, sync, guest_chunk_pages); } static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) { uint64_t gptr; + uint32_t guest_page_size = data->vm->page_size; - for (gptr = MEM_TEST_GPA + 4096 / 2; - gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) + for (gptr = MEM_TEST_GPA + guest_page_size / 2; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size) *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; host_perform_sync(sync); for (gptr = MEM_TEST_GPA; - gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) { + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size) { uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); uint64_t val = *vptr; @@ -738,7 +753,7 @@ static bool test_execute(int nslots, uint64_t *maxslots, struct timespec *slot_runtime, struct timespec *guest_runtime) { - uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES; + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE; struct vm_data *data; struct sync_area *sync; struct timespec tstart; @@ -753,6 +768,7 @@ static bool test_execute(int nslots, uint64_t *maxslots, sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + sync->guest_page_size = data->vm->page_size; if (tdata->prepare && !tdata->prepare(data, sync, maxslots)) { ret = false; @@ -786,19 +802,19 @@ exit_free: static const struct test_data tests[] = { { .name = "map", - .mem_size = MEM_SIZE_MAP_PAGES, + .mem_size = MEM_SIZE_MAP, .guest_code = guest_code_test_memslot_map, .loop = test_memslot_map_loop, }, { .name = "unmap", - .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .mem_size = MEM_TEST_UNMAP_SIZE + 4096, .guest_code = guest_code_test_memslot_unmap, .loop = test_memslot_unmap_loop, }, { .name = "unmap chunked", - .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .mem_size = MEM_TEST_UNMAP_SIZE + 4096, .guest_code = guest_code_test_memslot_unmap, .loop = test_memslot_unmap_loop_chunked, }, @@ -856,6 +872,35 @@ static void help(char *name, struct test_args *targs) pr_info("%d: %s\n", ctr, tests[ctr].name); } +static bool check_memory_sizes(void) +{ + uint32_t guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + + if (MEM_SIZE % guest_page_size || + MEM_TEST_SIZE % guest_page_size) { + pr_info("invalid MEM_SIZE or MEM_TEST_SIZE\n"); + return false; + } + + if (MEM_SIZE_MAP % guest_page_size || + MEM_TEST_MAP_SIZE % guest_page_size || + (MEM_TEST_MAP_SIZE / guest_page_size) <= 2 || + (MEM_TEST_MAP_SIZE / guest_page_size) % 2) { + pr_info("invalid MEM_SIZE_MAP or MEM_TEST_MAP_SIZE\n"); + return false; + } + + if (MEM_TEST_UNMAP_SIZE > MEM_TEST_SIZE || + MEM_TEST_UNMAP_SIZE % guest_page_size || + (MEM_TEST_UNMAP_SIZE / guest_page_size) % + (2 * MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size)) { + pr_info("invalid MEM_TEST_UNMAP_SIZE or MEM_TEST_UNMAP_CHUNK_SIZE\n"); + return false; + } + + return true; +} + static bool parse_args(int argc, char *argv[], struct test_args *targs) { @@ -1015,6 +1060,9 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); + if (!check_memory_sizes()) + return -1; + if (!parse_args(argc, argv, &targs)) return -1; -- GitLab From 88a64e65484ef6b5cb09fe545d0dd00c950a1131 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:08 +0800 Subject: [PATCH 0503/1423] KVM: selftests: memslot_perf_test: Consolidate memory The addresses and sizes passed to vm_userspace_mem_region_add() and madvise() should be aligned to host page size, which can be 64KB on aarch64. So it's wrong by passing additional fixed 4KB memory area to various tests. Fix it by passing additional fixed 64KB memory area to various tests. We also add checks to ensure that none of host/guest page size exceeds 64KB. MEM_TEST_MOVE_SIZE is fixed up to 192KB either. With this, the following command works fine on 64KB-page-size-host and 4KB-page-size-guest. # ./memslot_perf_test -v -s 512 Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-6-gshan@redhat.com --- .../testing/selftests/kvm/memslot_perf_test.c | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 9af61ca8ad0a0..daebc264de5ad 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -20,17 +20,20 @@ #include #include +#include #include #include #include -#define MEM_SIZE ((512U << 20) + 4096) -#define MEM_GPA 0x10000000UL +#define MEM_EXTRA_SIZE SZ_64K + +#define MEM_SIZE (SZ_512M + MEM_EXTRA_SIZE) +#define MEM_GPA SZ_256M #define MEM_AUX_GPA MEM_GPA #define MEM_SYNC_GPA MEM_AUX_GPA -#define MEM_TEST_GPA (MEM_AUX_GPA + 4096) -#define MEM_TEST_SIZE (MEM_SIZE - 4096) +#define MEM_TEST_GPA (MEM_AUX_GPA + MEM_EXTRA_SIZE) +#define MEM_TEST_SIZE (MEM_SIZE - MEM_EXTRA_SIZE) /* * 32 MiB is max size that gets well over 100 iterations on 509 slots. @@ -38,8 +41,8 @@ * 8194 slots in use can then be tested (although with slightly * limited resolution). */ -#define MEM_SIZE_MAP ((32U << 20) + 4096) -#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) +#define MEM_SIZE_MAP (SZ_32M + MEM_EXTRA_SIZE) +#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - MEM_EXTRA_SIZE) /* * 128 MiB is min size that fills 32k slots with at least one page in each @@ -47,8 +50,8 @@ * * 2 MiB chunk size like a typical huge page */ -#define MEM_TEST_UNMAP_SIZE (128U << 20) -#define MEM_TEST_UNMAP_CHUNK_SIZE (2U << 20) +#define MEM_TEST_UNMAP_SIZE SZ_128M +#define MEM_TEST_UNMAP_CHUNK_SIZE SZ_2M /* * For the move active test the middle of the test area is placed on @@ -64,12 +67,12 @@ * * architecture slots memory-per-slot memory-on-last-slot * -------------------------------------------------------------- - * x86-4KB 32763 16KB 100KB - * arm64-4KB 32766 16KB 52KB - * arm64-16KB 32766 16KB 48KB - * arm64-64KB 8192 64KB 64KB + * x86-4KB 32763 16KB 160KB + * arm64-4KB 32766 16KB 112KB + * arm64-16KB 32766 16KB 112KB + * arm64-64KB 8192 64KB 128KB */ -#define MEM_TEST_MOVE_SIZE 0x10000 +#define MEM_TEST_MOVE_SIZE (3 * SZ_64K) #define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, "invalid move test region size"); @@ -533,7 +536,6 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint64_t *maxslots, bool isactive) { uint32_t guest_page_size = data->vm->page_size; - uint64_t move_pages = MEM_TEST_MOVE_SIZE / guest_page_size; uint64_t movesrcgpa, movetestgpa; movesrcgpa = vm_slot2gpa(data, data->nslots - 1); @@ -542,7 +544,7 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint64_t lastpages; vm_gpa2hva(data, movesrcgpa, &lastpages); - if (lastpages < move_pages / 2) { + if (lastpages * guest_page_size < MEM_TEST_MOVE_SIZE / 2) { *maxslots = 0; return false; } @@ -808,13 +810,13 @@ static const struct test_data tests[] = { }, { .name = "unmap", - .mem_size = MEM_TEST_UNMAP_SIZE + 4096, + .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE, .guest_code = guest_code_test_memslot_unmap, .loop = test_memslot_unmap_loop, }, { .name = "unmap chunked", - .mem_size = MEM_TEST_UNMAP_SIZE + 4096, + .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE, .guest_code = guest_code_test_memslot_unmap, .loop = test_memslot_unmap_loop_chunked, }, @@ -874,8 +876,15 @@ static void help(char *name, struct test_args *targs) static bool check_memory_sizes(void) { + uint32_t host_page_size = getpagesize(); uint32_t guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + if (host_page_size > SZ_64K || guest_page_size > SZ_64K) { + pr_info("Unsupported page size on host (0x%x) or guest (0x%x)\n", + host_page_size, guest_page_size); + return false; + } + if (MEM_SIZE % guest_page_size || MEM_TEST_SIZE % guest_page_size) { pr_info("invalid MEM_SIZE or MEM_TEST_SIZE\n"); -- GitLab From a69170c65acdf430e24fc1b6174dcc3aa501fe2f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Oct 2022 15:12:09 +0800 Subject: [PATCH 0504/1423] KVM: selftests: memslot_perf_test: Report optimal memory slots The memory area in each slot should be aligned to host page size. Otherwise, the test will fail. For example, the following command fails with the following messages with 64KB-page-size-host and 4KB-pae-size-guest. It's not user friendly to abort the test. Lets do something to report the optimal memory slots, instead of failing the test. # ./memslot_perf_test -v -s 1000 Number of memory slots: 999 Testing map performance with 1 runs, 5 seconds each Adding slots 1..999, each slot with 8 pages + 216 extra pages last ==== Test Assertion Failure ==== lib/kvm_util.c:824: vm_adjust_num_guest_pages(vm->mode, npages) == npages pid=19872 tid=19872 errno=0 - Success 1 0x00000000004065b3: vm_userspace_mem_region_add at kvm_util.c:822 2 0x0000000000401d6b: prepare_vm at memslot_perf_test.c:273 3 (inlined by) test_execute at memslot_perf_test.c:756 4 (inlined by) test_loop at memslot_perf_test.c:994 5 (inlined by) main at memslot_perf_test.c:1073 6 0x0000ffff7ebb4383: ?? ??:0 7 0x00000000004021ff: _start at :? Number of guest pages is not compatible with the host. Try npages=16 Report the optimal memory slots instead of failing the test when the memory area in each slot isn't aligned to host page size. With this applied, the optimal memory slots is reported. # ./memslot_perf_test -v -s 1000 Number of memory slots: 999 Testing map performance with 1 runs, 5 seconds each Memslot count too high for this test, decrease the cap (max is 514) Signed-off-by: Gavin Shan Reviewed-by: Maciej S. Szmigiero Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020071209.559062-7-gshan@redhat.com --- .../testing/selftests/kvm/memslot_perf_test.c | 45 +++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index daebc264de5ad..2ad40f7c9c08e 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -239,16 +239,52 @@ static struct vm_data *alloc_vm(void) return data; } +static bool check_slot_pages(uint32_t host_page_size, uint32_t guest_page_size, + uint64_t pages_per_slot, uint64_t rempages) +{ + if (!pages_per_slot) + return false; + + if ((pages_per_slot * guest_page_size) % host_page_size) + return false; + + if ((rempages * guest_page_size) % host_page_size) + return false; + + return true; +} + + +static uint64_t get_max_slots(struct vm_data *data, uint32_t host_page_size) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t mempages, pages_per_slot, rempages; + uint64_t slots; + + mempages = data->npages; + slots = data->nslots; + while (--slots > 1) { + pages_per_slot = mempages / slots; + rempages = mempages % pages_per_slot; + if (check_slot_pages(host_page_size, guest_page_size, + pages_per_slot, rempages)) + return slots + 1; /* slot 0 is reserved */ + } + + return 0; +} + static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, void *guest_code, uint64_t mem_size, struct timespec *slot_runtime) { uint64_t mempages, rempages; uint64_t guest_addr; - uint32_t slot, guest_page_size; + uint32_t slot, host_page_size, guest_page_size; struct timespec tstart; struct sync_area *sync; + host_page_size = getpagesize(); guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; mempages = mem_size / guest_page_size; @@ -260,12 +296,13 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, TEST_ASSERT(data->npages > 1, "Can't test without any memory"); data->nslots = nslots; data->pages_per_slot = data->npages / data->nslots; - if (!data->pages_per_slot) { - *maxslots = data->npages + 1; + rempages = data->npages % data->nslots; + if (!check_slot_pages(host_page_size, guest_page_size, + data->pages_per_slot, rempages)) { + *maxslots = get_max_slots(data, host_page_size); return false; } - rempages = data->npages % data->nslots; data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); -- GitLab From 1a6182033f2d5c481aec1f8c1c26ebc649693d57 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:54 -0700 Subject: [PATCH 0505/1423] KVM: arm64: selftests: Use FIELD_GET() to extract ID register fields Use FIELD_GET() macro to extract ID register fields for existing aarch64 selftests code. No functional change intended. Signed-off-by: Reiji Watanabe Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-2-reijiw@google.com --- tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c | 3 ++- tools/testing/selftests/kvm/aarch64/debug-exceptions.c | 3 ++- tools/testing/selftests/kvm/lib/aarch64/processor.c | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c index 6f9c1f19c7f64..b6a5e8861b354 100644 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "processor.h" #include "test_util.h" +#include #define BAD_ID_REG_VAL 0x1badc0deul @@ -145,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); - el0 = (val & ARM64_FEATURE_MASK(ID_AA64PFR0_EL0)) >> ID_AA64PFR0_EL0_SHIFT; + el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), val); return el0 == ID_AA64PFR0_ELx_64BIT_ONLY; } diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 947bd201435ce..3808d3d750558 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -2,6 +2,7 @@ #include #include #include +#include #define MDSCR_KDE (1 << 13) #define MDSCR_MDE (1 << 15) @@ -284,7 +285,7 @@ static int debug_version(struct kvm_vcpu *vcpu) uint64_t id_aa64dfr0; vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); - return id_aa64dfr0 & 0xf; + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), id_aa64dfr0); } static void test_guest_debug_exceptions(void) diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6f5551368944e..7c96b931edd5c 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -11,6 +11,7 @@ #include "guest_modes.h" #include "kvm_util.h" #include "processor.h" +#include #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 @@ -486,9 +487,9 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); - *ps4k = ((val >> 28) & 0xf) != 0xf; - *ps64k = ((val >> 24) & 0xf) == 0; - *ps16k = ((val >> 20) & 0xf) != 0; + *ps4k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN4), val) != 0xf; + *ps64k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN64), val) == 0; + *ps16k = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_TGRAN16), val) != 0; close(vcpu_fd); close(vm_fd); -- GitLab From f6d02aa28ae21161d64300bac62b2dde85584004 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:55 -0700 Subject: [PATCH 0506/1423] KVM: arm64: selftests: Add write_dbg{b,w}{c,v}r helpers in debug-exceptions Introduce helpers in the debug-exceptions test to write to dbg{b,w}{c,v}r registers. Those helpers will be useful for test cases that will be added to the test in subsequent patches. No functional change intended. Signed-off-by: Reiji Watanabe Reviewed-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-3-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 72 +++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 3808d3d750558..d9884907fe87b 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -30,6 +30,69 @@ static volatile uint64_t svc_addr; static volatile uint64_t ss_addr[4], ss_idx; #define PC(v) ((uint64_t)&(v)) +#define GEN_DEBUG_WRITE_REG(reg_name) \ +static void write_##reg_name(int num, uint64_t val) \ +{ \ + switch (num) { \ + case 0: \ + write_sysreg(val, reg_name##0_el1); \ + break; \ + case 1: \ + write_sysreg(val, reg_name##1_el1); \ + break; \ + case 2: \ + write_sysreg(val, reg_name##2_el1); \ + break; \ + case 3: \ + write_sysreg(val, reg_name##3_el1); \ + break; \ + case 4: \ + write_sysreg(val, reg_name##4_el1); \ + break; \ + case 5: \ + write_sysreg(val, reg_name##5_el1); \ + break; \ + case 6: \ + write_sysreg(val, reg_name##6_el1); \ + break; \ + case 7: \ + write_sysreg(val, reg_name##7_el1); \ + break; \ + case 8: \ + write_sysreg(val, reg_name##8_el1); \ + break; \ + case 9: \ + write_sysreg(val, reg_name##9_el1); \ + break; \ + case 10: \ + write_sysreg(val, reg_name##10_el1); \ + break; \ + case 11: \ + write_sysreg(val, reg_name##11_el1); \ + break; \ + case 12: \ + write_sysreg(val, reg_name##12_el1); \ + break; \ + case 13: \ + write_sysreg(val, reg_name##13_el1); \ + break; \ + case 14: \ + write_sysreg(val, reg_name##14_el1); \ + break; \ + case 15: \ + write_sysreg(val, reg_name##15_el1); \ + break; \ + default: \ + GUEST_ASSERT(0); \ + } \ +} + +/* Define write_dbgbcr()/write_dbgbvr()/write_dbgwcr()/write_dbgwvr() */ +GEN_DEBUG_WRITE_REG(dbgbcr) +GEN_DEBUG_WRITE_REG(dbgbvr) +GEN_DEBUG_WRITE_REG(dbgwcr) +GEN_DEBUG_WRITE_REG(dbgwvr) + static void reset_debug_state(void) { asm volatile("msr daifset, #8"); @@ -61,8 +124,9 @@ static void install_wp(uint64_t addr) uint32_t mdscr; wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; - write_sysreg(wcr, dbgwcr0_el1); - write_sysreg(addr, dbgwvr0_el1); + write_dbgwcr(0, wcr); + write_dbgwvr(0, addr); + isb(); asm volatile("msr daifclr, #8"); @@ -78,8 +142,8 @@ static void install_hw_bp(uint64_t addr) uint32_t mdscr; bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; - write_sysreg(bcr, dbgbcr0_el1); - write_sysreg(addr, dbgbvr0_el1); + write_dbgbcr(0, bcr); + write_dbgbvr(0, addr); isb(); asm volatile("msr daifclr, #8"); -- GitLab From 700b8860e02cbaa7dd1181a914ff38e0fae18bf0 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:56 -0700 Subject: [PATCH 0507/1423] KVM: arm64: selftests: Remove the hard-coded {b,w}pn#0 from debug-exceptions Remove the hard-coded {break,watch}point #0 from the guest_code() in debug-exceptions to allow {break,watch}point number to be specified. Change reset_debug_state() to zeroing all dbg{b,w}{c,v}r_el0 registers so that guest_code() can use the function to reset those registers even when non-zero {break,watch}points are specified for guest_code(). Subsequent patches will add test cases for non-zero {break,watch}points. Signed-off-by: Reiji Watanabe Reviewed-by: Ricardo Koller Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-4-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index d9884907fe87b..608a6c8db9a21 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -95,6 +95,9 @@ GEN_DEBUG_WRITE_REG(dbgwvr) static void reset_debug_state(void) { + uint8_t brps, wrps, i; + uint64_t dfr0; + asm volatile("msr daifset, #8"); write_sysreg(0, osdlr_el1); @@ -102,11 +105,20 @@ static void reset_debug_state(void) isb(); write_sysreg(0, mdscr_el1); - /* This test only uses the first bp and wp slot. */ - write_sysreg(0, dbgbvr0_el1); - write_sysreg(0, dbgbcr0_el1); - write_sysreg(0, dbgwcr0_el1); - write_sysreg(0, dbgwvr0_el1); + + /* Reset all bcr/bvr/wcr/wvr registers */ + dfr0 = read_sysreg(id_aa64dfr0_el1); + brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), dfr0); + for (i = 0; i <= brps; i++) { + write_dbgbcr(i, 0); + write_dbgbvr(i, 0); + } + wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_WRPS), dfr0); + for (i = 0; i <= wrps; i++) { + write_dbgwcr(i, 0); + write_dbgwvr(i, 0); + } + isb(); } @@ -118,14 +130,14 @@ static void enable_os_lock(void) GUEST_ASSERT(read_sysreg(oslsr_el1) & 2); } -static void install_wp(uint64_t addr) +static void install_wp(uint8_t wpn, uint64_t addr) { uint32_t wcr; uint32_t mdscr; wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; - write_dbgwcr(0, wcr); - write_dbgwvr(0, addr); + write_dbgwcr(wpn, wcr); + write_dbgwvr(wpn, addr); isb(); @@ -136,14 +148,14 @@ static void install_wp(uint64_t addr) isb(); } -static void install_hw_bp(uint64_t addr) +static void install_hw_bp(uint8_t bpn, uint64_t addr) { uint32_t bcr; uint32_t mdscr; bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; - write_dbgbcr(0, bcr); - write_dbgbvr(0, addr); + write_dbgbcr(bpn, bcr); + write_dbgbvr(bpn, addr); isb(); asm volatile("msr daifclr, #8"); @@ -166,7 +178,7 @@ static void install_ss(void) static volatile char write_data; -static void guest_code(void) +static void guest_code(uint8_t bpn, uint8_t wpn) { GUEST_SYNC(0); @@ -179,7 +191,7 @@ static void guest_code(void) /* Hardware-breakpoint */ reset_debug_state(); - install_hw_bp(PC(hw_bp)); + install_hw_bp(bpn, PC(hw_bp)); asm volatile("hw_bp: nop"); GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); @@ -187,7 +199,7 @@ static void guest_code(void) /* Hardware-breakpoint + svc */ reset_debug_state(); - install_hw_bp(PC(bp_svc)); + install_hw_bp(bpn, PC(bp_svc)); asm volatile("bp_svc: svc #0"); GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); @@ -196,7 +208,7 @@ static void guest_code(void) /* Hardware-breakpoint + software-breakpoint */ reset_debug_state(); - install_hw_bp(PC(bp_brk)); + install_hw_bp(bpn, PC(bp_brk)); asm volatile("bp_brk: brk #0"); GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); @@ -205,7 +217,7 @@ static void guest_code(void) /* Watchpoint */ reset_debug_state(); - install_wp(PC(write_data)); + install_wp(wpn, PC(write_data)); write_data = 'x'; GUEST_ASSERT_EQ(write_data, 'x'); GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); @@ -239,7 +251,7 @@ static void guest_code(void) /* OS Lock blocking hardware-breakpoint */ reset_debug_state(); enable_os_lock(); - install_hw_bp(PC(hw_bp2)); + install_hw_bp(bpn, PC(hw_bp2)); hw_bp_addr = 0; asm volatile("hw_bp2: nop"); GUEST_ASSERT_EQ(hw_bp_addr, 0); @@ -251,7 +263,7 @@ static void guest_code(void) enable_os_lock(); write_data = '\0'; wp_data_addr = 0; - install_wp(PC(write_data)); + install_wp(wpn, PC(write_data)); write_data = 'x'; GUEST_ASSERT_EQ(write_data, 'x'); GUEST_ASSERT_EQ(wp_data_addr, 0); @@ -376,6 +388,8 @@ static void test_guest_debug_exceptions(void) vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_SVC64, guest_svc_handler); + /* Run tests with breakpoint#0 and watchpoint#0. */ + vcpu_args_set(vcpu, 2, 0, 0); for (stage = 0; stage < 11; stage++) { vcpu_run(vcpu); -- GitLab From 152880d8edf5ad6df5b4b4915a4d9f9085ab8fef Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:57 -0700 Subject: [PATCH 0508/1423] KVM: arm64: selftests: Add helpers to enable debug exceptions Add helpers to enable breakpoint and watchpoint exceptions. No functional change intended. Signed-off-by: Reiji Watanabe Reviewed-by: Ricardo Koller Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-5-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 608a6c8db9a21..0c237022f4d39 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -130,10 +130,20 @@ static void enable_os_lock(void) GUEST_ASSERT(read_sysreg(oslsr_el1) & 2); } +static void enable_monitor_debug_exceptions(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr, mdscr_el1); + isb(); +} + static void install_wp(uint8_t wpn, uint64_t addr) { uint32_t wcr; - uint32_t mdscr; wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; write_dbgwcr(wpn, wcr); @@ -141,28 +151,19 @@ static void install_wp(uint8_t wpn, uint64_t addr) isb(); - asm volatile("msr daifclr, #8"); - - mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; - write_sysreg(mdscr, mdscr_el1); - isb(); + enable_monitor_debug_exceptions(); } static void install_hw_bp(uint8_t bpn, uint64_t addr) { uint32_t bcr; - uint32_t mdscr; bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; write_dbgbcr(bpn, bcr); write_dbgbvr(bpn, addr); isb(); - asm volatile("msr daifclr, #8"); - - mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; - write_sysreg(mdscr, mdscr_el1); - isb(); + enable_monitor_debug_exceptions(); } static void install_ss(void) -- GitLab From 948f439c9d0080972ec937f4aefbe51229546510 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:58 -0700 Subject: [PATCH 0509/1423] KVM: arm64: selftests: Stop unnecessary test stage tracking of debug-exceptions Currently, debug-exceptions test unnecessarily tracks some test stages using GUEST_SYNC(). The code for it needs to be updated as test cases are added or removed. Stop doing the unnecessary stage tracking, as they are not so useful and are a bit pain to maintain. Signed-off-by: Reiji Watanabe Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-6-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 46 ++++--------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 0c237022f4d39..040e4d7f8755c 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -181,23 +181,17 @@ static volatile char write_data; static void guest_code(uint8_t bpn, uint8_t wpn) { - GUEST_SYNC(0); - /* Software-breakpoint */ reset_debug_state(); asm volatile("sw_bp: brk #0"); GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); - GUEST_SYNC(1); - /* Hardware-breakpoint */ reset_debug_state(); install_hw_bp(bpn, PC(hw_bp)); asm volatile("hw_bp: nop"); GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); - GUEST_SYNC(2); - /* Hardware-breakpoint + svc */ reset_debug_state(); install_hw_bp(bpn, PC(bp_svc)); @@ -205,8 +199,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); - GUEST_SYNC(3); - /* Hardware-breakpoint + software-breakpoint */ reset_debug_state(); install_hw_bp(bpn, PC(bp_brk)); @@ -214,8 +206,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); - GUEST_SYNC(4); - /* Watchpoint */ reset_debug_state(); install_wp(wpn, PC(write_data)); @@ -223,8 +213,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) GUEST_ASSERT_EQ(write_data, 'x'); GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); - GUEST_SYNC(5); - /* Single-step */ reset_debug_state(); install_ss(); @@ -238,8 +226,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); - GUEST_SYNC(6); - /* OS Lock does not block software-breakpoint */ reset_debug_state(); enable_os_lock(); @@ -247,8 +233,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) asm volatile("sw_bp2: brk #0"); GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp2)); - GUEST_SYNC(7); - /* OS Lock blocking hardware-breakpoint */ reset_debug_state(); enable_os_lock(); @@ -257,8 +241,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) asm volatile("hw_bp2: nop"); GUEST_ASSERT_EQ(hw_bp_addr, 0); - GUEST_SYNC(8); - /* OS Lock blocking watchpoint */ reset_debug_state(); enable_os_lock(); @@ -269,8 +251,6 @@ static void guest_code(uint8_t bpn, uint8_t wpn) GUEST_ASSERT_EQ(write_data, 'x'); GUEST_ASSERT_EQ(wp_data_addr, 0); - GUEST_SYNC(9); - /* OS Lock blocking single-step */ reset_debug_state(); enable_os_lock(); @@ -370,7 +350,6 @@ static void test_guest_debug_exceptions(void) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - int stage; vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); @@ -391,23 +370,16 @@ static void test_guest_debug_exceptions(void) /* Run tests with breakpoint#0 and watchpoint#0. */ vcpu_args_set(vcpu, 2, 0, 0); - for (stage = 0; stage < 11; stage++) { - vcpu_run(vcpu); - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == stage, - "Stage %d: Unexpected sync ucall, got %lx", - stage, (ulong)uc.args[1]); - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); - break; - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); } done: -- GitLab From 5dd544e882d96d43b363c5ef64683281f2a386d9 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:41:59 -0700 Subject: [PATCH 0510/1423] KVM: arm64: selftests: Change debug_version() to take ID_AA64DFR0_EL1 Change debug_version() to take the ID_AA64DFR0_EL1 value instead of vcpu as an argument, and change its callsite to read ID_AA64DFR0_EL1 (and pass it to debug_version()). Subsequent patches will reuse the register value in the callsite. No functional change intended. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-7-reijiw@google.com --- tools/testing/selftests/kvm/aarch64/debug-exceptions.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 040e4d7f8755c..72ec9bb166827 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -337,11 +337,8 @@ static void guest_code_ss(int test_cnt) GUEST_DONE(); } -static int debug_version(struct kvm_vcpu *vcpu) +static int debug_version(uint64_t id_aa64dfr0) { - uint64_t id_aa64dfr0; - - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), id_aa64dfr0); } @@ -466,9 +463,11 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; int opt; int ss_iteration = 10000; + uint64_t aa64dfr0; vm = vm_create_with_one_vcpu(&vcpu, guest_code); - __TEST_REQUIRE(debug_version(vcpu) >= 6, + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &aa64dfr0); + __TEST_REQUIRE(debug_version(aa64dfr0) >= 6, "Armv8 debug architecture not supported."); kvm_vm_free(vm); -- GitLab From 142365932f5f296df593dd653d79194ff5457722 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:42:00 -0700 Subject: [PATCH 0511/1423] KVM: arm64: selftests: Add a test case for a linked breakpoint Currently, the debug-exceptions test doesn't have a test case for a linked breakpoint. Add a test case for the linked breakpoint to the test. The new test case uses a pair of breakpoints. One is the higiest numbered context-aware breakpoint (for Context ID match), and the other one is the breakpoint#0 (for Address Match), which is linked to the context-aware breakpoint. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-8-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 63 +++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 72ec9bb166827..362e7668a9782 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -12,6 +12,10 @@ #define DBGBCR_EXEC (0x0 << 3) #define DBGBCR_EL1 (0x1 << 1) #define DBGBCR_E (0x1 << 0) +#define DBGBCR_LBN_SHIFT 16 +#define DBGBCR_BT_SHIFT 20 +#define DBGBCR_BT_ADDR_LINK_CTX (0x1 << DBGBCR_BT_SHIFT) +#define DBGBCR_BT_CTX_LINK (0x3 << DBGBCR_BT_SHIFT) #define DBGWCR_LEN8 (0xff << 5) #define DBGWCR_RD (0x1 << 3) @@ -22,7 +26,7 @@ #define SPSR_D (1 << 9) #define SPSR_SS (1 << 21) -extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start; +extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start, hw_bp_ctx; extern unsigned char iter_ss_begin, iter_ss_end; static volatile uint64_t sw_bp_addr, hw_bp_addr; static volatile uint64_t wp_addr, wp_data_addr; @@ -105,6 +109,7 @@ static void reset_debug_state(void) isb(); write_sysreg(0, mdscr_el1); + write_sysreg(0, contextidr_el1); /* Reset all bcr/bvr/wcr/wvr registers */ dfr0 = read_sysreg(id_aa64dfr0_el1); @@ -166,6 +171,31 @@ static void install_hw_bp(uint8_t bpn, uint64_t addr) enable_monitor_debug_exceptions(); } +void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t addr_bcr, ctx_bcr; + + /* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* + * Setup a normal breakpoint for Linked Address Match, and link it + * to the context-aware breakpoint. + */ + addr_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_ADDR_LINK_CTX | + ((uint32_t)ctx_bp << DBGBCR_LBN_SHIFT); + write_dbgbcr(addr_bp, addr_bcr); + write_dbgbvr(addr_bp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + static void install_ss(void) { uint32_t mdscr; @@ -179,8 +209,10 @@ static void install_ss(void) static volatile char write_data; -static void guest_code(uint8_t bpn, uint8_t wpn) +static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) { + uint64_t ctx = 0xabcdef; /* a random context number */ + /* Software-breakpoint */ reset_debug_state(); asm volatile("sw_bp: brk #0"); @@ -263,6 +295,17 @@ static void guest_code(uint8_t bpn, uint8_t wpn) : : : "x0"); GUEST_ASSERT_EQ(ss_addr[0], 0); + /* Linked hardware-breakpoint */ + hw_bp_addr = 0; + reset_debug_state(); + install_hw_bp_ctx(bpn, ctx_bpn, PC(hw_bp_ctx), ctx); + /* Set context id */ + write_sysreg(ctx, contextidr_el1); + isb(); + asm volatile("hw_bp_ctx: nop"); + write_sysreg(0, contextidr_el1); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx)); + GUEST_DONE(); } @@ -342,11 +385,12 @@ static int debug_version(uint64_t id_aa64dfr0) return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), id_aa64dfr0); } -static void test_guest_debug_exceptions(void) +static void test_guest_debug_exceptions(uint64_t aa64dfr0) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; + uint8_t brp_num; vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); @@ -365,8 +409,15 @@ static void test_guest_debug_exceptions(void) vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_SVC64, guest_svc_handler); - /* Run tests with breakpoint#0 and watchpoint#0. */ - vcpu_args_set(vcpu, 2, 0, 0); + /* Number of breakpoints */ + brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), aa64dfr0) + 1; + __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); + + /* + * Run tests with breakpoint#0, watchpoint#0, and the higiest + * numbered (context-aware) breakpoint. + */ + vcpu_args_set(vcpu, 3, 0, 0, brp_num - 1); vcpu_run(vcpu); switch (get_ucall(vcpu, &uc)) { @@ -483,7 +534,7 @@ int main(int argc, char *argv[]) } } - test_guest_debug_exceptions(); + test_guest_debug_exceptions(aa64dfr0); test_single_step_from_userspace(ss_iteration); return 0; -- GitLab From 5ced4e533b676b1a582d89aba5328e4b316957e0 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:42:01 -0700 Subject: [PATCH 0512/1423] KVM: arm64: selftests: Add a test case for a linked watchpoint Currently, the debug-exceptions test doesn't have a test case for a linked watchpoint. Add a test case for the linked watchpoint to the test. The new test case uses the highest numbered context-aware breakpoint (for Context ID match), and the watchpoint#0, which is linked to the context-aware breakpoint. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-9-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 362e7668a9782..73a95e6b345e5 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -22,6 +22,9 @@ #define DBGWCR_WR (0x2 << 3) #define DBGWCR_EL1 (0x1 << 1) #define DBGWCR_E (0x1 << 0) +#define DBGWCR_LBN_SHIFT 16 +#define DBGWCR_WT_SHIFT 20 +#define DBGWCR_WT_LINK (0x1 << DBGWCR_WT_SHIFT) #define SPSR_D (1 << 9) #define SPSR_SS (1 << 21) @@ -171,6 +174,28 @@ static void install_hw_bp(uint8_t bpn, uint64_t addr) enable_monitor_debug_exceptions(); } +static void install_wp_ctx(uint8_t addr_wp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t wcr; + uint64_t ctx_bcr; + + /* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* Setup a linked watchpoint (linked to the context-aware breakpoint) */ + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E | + DBGWCR_WT_LINK | ((uint32_t)ctx_bp << DBGWCR_LBN_SHIFT); + write_dbgwcr(addr_wp, wcr); + write_dbgwvr(addr_wp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr, uint64_t ctx) { @@ -306,6 +331,16 @@ static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) write_sysreg(0, contextidr_el1); GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx)); + /* Linked watchpoint */ + reset_debug_state(); + install_wp_ctx(wpn, ctx_bpn, PC(write_data), ctx); + /* Set context id */ + write_sysreg(ctx, contextidr_el1); + isb(); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + GUEST_DONE(); } -- GitLab From ebb8cc10316de3040efc4cfb40030f374cbbaa3b Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Wed, 19 Oct 2022 22:42:02 -0700 Subject: [PATCH 0513/1423] KVM: arm64: selftests: Test with every breakpoint/watchpoint Currently, the debug-exceptions test always uses only {break,watch}point#0 and the highest numbered context-aware breakpoint. Modify the test to use all {break,watch}points and context-aware breakpoints supported on the system. Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221020054202.2119018-10-reijiw@google.com --- .../selftests/kvm/aarch64/debug-exceptions.c | 54 ++++++++++++++----- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 73a95e6b345e5..b30add3e77269 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -420,12 +420,11 @@ static int debug_version(uint64_t id_aa64dfr0) return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), id_aa64dfr0); } -static void test_guest_debug_exceptions(uint64_t aa64dfr0) +static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - uint8_t brp_num; vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); @@ -444,15 +443,9 @@ static void test_guest_debug_exceptions(uint64_t aa64dfr0) vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_SVC64, guest_svc_handler); - /* Number of breakpoints */ - brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), aa64dfr0) + 1; - __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); - - /* - * Run tests with breakpoint#0, watchpoint#0, and the higiest - * numbered (context-aware) breakpoint. - */ - vcpu_args_set(vcpu, 3, 0, 0, brp_num - 1); + /* Specify bpn/wpn/ctx_bpn to be tested */ + vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn); + pr_debug("Use bpn#%d, wpn#%d and ctx_bpn#%d\n", bpn, wpn, ctx_bpn); vcpu_run(vcpu); switch (get_ucall(vcpu, &uc)) { @@ -535,6 +528,43 @@ void test_single_step_from_userspace(int test_cnt) kvm_vm_free(vm); } +/* + * Run debug testing using the various breakpoint#, watchpoint# and + * context-aware breakpoint# with the given ID_AA64DFR0_EL1 configuration. + */ +void test_guest_debug_exceptions_all(uint64_t aa64dfr0) +{ + uint8_t brp_num, wrp_num, ctx_brp_num, normal_brp_num, ctx_brp_base; + int b, w, c; + + /* Number of breakpoints */ + brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_BRPS), aa64dfr0) + 1; + __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); + + /* Number of watchpoints */ + wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_WRPS), aa64dfr0) + 1; + + /* Number of context aware breakpoints */ + ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_CTX_CMPS), aa64dfr0) + 1; + + pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, + brp_num, wrp_num, ctx_brp_num); + + /* Number of normal (non-context aware) breakpoints */ + normal_brp_num = brp_num - ctx_brp_num; + + /* Lowest context aware breakpoint number */ + ctx_brp_base = normal_brp_num; + + /* Run tests with all supported breakpoints/watchpoints */ + for (c = ctx_brp_base; c < ctx_brp_base + ctx_brp_num; c++) { + for (b = 0; b < normal_brp_num; b++) { + for (w = 0; w < wrp_num; w++) + test_guest_debug_exceptions(b, w, c); + } + } +} + static void help(char *name) { puts(""); @@ -569,7 +599,7 @@ int main(int argc, char *argv[]) } } - test_guest_debug_exceptions(aa64dfr0); + test_guest_debug_exceptions_all(aa64dfr0); test_single_step_from_userspace(ss_iteration); return 0; -- GitLab From a93871d0ea9fd59fb5eb783619334183d7f07f51 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:21 +0000 Subject: [PATCH 0514/1423] KVM: selftests: Add a userfaultfd library Move the generic userfaultfd code out of demand_paging_test.c into a common library, userfaultfd_util. This library consists of a setup and a stop function. The setup function starts a thread for handling page faults using the handler callback function. This setup returns a uffd_desc object which is then used in the stop function (to wait and destroy the threads). Reviewed-by: Oliver Upton Reviewed-by: Ben Gardon Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-2-ricarkol@google.com --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/demand_paging_test.c | 228 +++--------------- .../selftests/kvm/include/userfaultfd_util.h | 45 ++++ .../selftests/kvm/lib/userfaultfd_util.c | 186 ++++++++++++++ 4 files changed, 262 insertions(+), 198 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/userfaultfd_util.h create mode 100644 tools/testing/selftests/kvm/lib/userfaultfd_util.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0172eb6cb6eee..08a2606aff331 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -47,6 +47,7 @@ LIBKVM += lib/perf_test_util.c LIBKVM += lib/rbtree.c LIBKVM += lib/sparsebit.c LIBKVM += lib/test_util.c +LIBKVM += lib/userfaultfd_util.c LIBKVM_STRING += lib/string_override.c diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 779ae54f89c40..8e1fe4ffcccdf 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -22,23 +22,13 @@ #include "test_util.h" #include "perf_test_util.h" #include "guest_modes.h" +#include "userfaultfd_util.h" #ifdef __NR_userfaultfd -#ifdef PRINT_PER_PAGE_UPDATES -#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) -#else -#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) -#endif - -#ifdef PRINT_PER_VCPU_UPDATES -#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) -#else -#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) -#endif - static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + static size_t demand_paging_size; static char *guest_data_prototype; @@ -67,9 +57,11 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) ts_diff.tv_sec, ts_diff.tv_nsec); } -static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) +static int handle_uffd_page_request(int uffd_mode, int uffd, + struct uffd_msg *msg) { pid_t tid = syscall(__NR_gettid); + uint64_t addr = msg->arg.pagefault.address; struct timespec start; struct timespec ts_diff; int r; @@ -116,174 +108,32 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) return 0; } -bool quit_uffd_thread; - -struct uffd_handler_args { +struct test_params { int uffd_mode; - int uffd; - int pipefd; - useconds_t delay; + useconds_t uffd_delay; + enum vm_mem_backing_src_type src_type; + bool partition_vcpu_memory_access; }; -static void *uffd_handler_thread_fn(void *arg) -{ - struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg; - int uffd = uffd_args->uffd; - int pipefd = uffd_args->pipefd; - useconds_t delay = uffd_args->delay; - int64_t pages = 0; - struct timespec start; - struct timespec ts_diff; - - clock_gettime(CLOCK_MONOTONIC, &start); - while (!quit_uffd_thread) { - struct uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; - int r; - uint64_t addr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; - pollfd[1].events = POLLIN; - - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } - - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } - - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); - TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread\n"); - return NULL; - } - - if (!(pollfd[0].revents & POLLIN)) - continue; - - r = read(uffd, &msg, sizeof(msg)); - if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd got errno %d\n", errno); - return NULL; - } - - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } - - if (!(msg.event & UFFD_EVENT_PAGEFAULT)) - continue; - - if (delay) - usleep(delay); - addr = msg.arg.pagefault.address; - r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr); - if (r < 0) - return NULL; - pages++; - } - - ts_diff = timespec_elapsed(start); - PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", - pages, ts_diff.tv_sec, ts_diff.tv_nsec, - pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - - return NULL; -} - -static void setup_demand_paging(struct kvm_vm *vm, - pthread_t *uffd_handler_thread, int pipefd, - int uffd_mode, useconds_t uffd_delay, - struct uffd_handler_args *uffd_args, - void *hva, void *alias, uint64_t len) +static void prefault_mem(void *alias, uint64_t len) { - bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); - int uffd; - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - int ret; + size_t p; - PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", - is_minor ? "MINOR" : "MISSING", - is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); - - /* In order to get minor faults, prefault via the alias. */ - if (is_minor) { - size_t p; - - expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; - - TEST_ASSERT(alias != NULL, "Alias required for minor faults"); - for (p = 0; p < (len / demand_paging_size); ++p) { - memcpy(alias + (p * demand_paging_size), - guest_data_prototype, demand_paging_size); - } + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); + for (p = 0; p < (len / demand_paging_size); ++p) { + memcpy(alias + (p * demand_paging_size), + guest_data_prototype, demand_paging_size); } - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(uffd >= 0, __KVM_SYSCALL_ERROR("userfaultfd()", uffd)); - - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - ret = ioctl(uffd, UFFDIO_API, &uffdio_api); - TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_API", ret)); - - uffdio_register.range.start = (uint64_t)hva; - uffdio_register.range.len = len; - uffdio_register.mode = uffd_mode; - ret = ioctl(uffd, UFFDIO_REGISTER, &uffdio_register); - TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_REGISTER", ret)); - TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == - expected_ioctls, "missing userfaultfd ioctls"); - - uffd_args->uffd_mode = uffd_mode; - uffd_args->uffd = uffd; - uffd_args->pipefd = pipefd; - uffd_args->delay = uffd_delay; - pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn, - uffd_args); - - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); } -struct test_params { - int uffd_mode; - useconds_t uffd_delay; - enum vm_mem_backing_src_type src_type; - bool partition_vcpu_memory_access; -}; - static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; - pthread_t *uffd_handler_threads = NULL; - struct uffd_handler_args *uffd_args = NULL; + struct uffd_desc **uffd_descs = NULL; struct timespec start; struct timespec ts_diff; - int *pipefds = NULL; struct kvm_vm *vm; - int r, i; + int i; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -296,15 +146,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode) { - uffd_handler_threads = - malloc(nr_vcpus * sizeof(*uffd_handler_threads)); - TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); - - uffd_args = malloc(nr_vcpus * sizeof(*uffd_args)); - TEST_ASSERT(uffd_args, "Memory allocation failed"); - - pipefds = malloc(sizeof(int) * nr_vcpus * 2); - TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); + uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + TEST_ASSERT(uffd_descs, "Memory allocation failed"); for (i = 0; i < nr_vcpus; i++) { struct perf_test_vcpu_args *vcpu_args; @@ -317,19 +160,17 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa); + prefault_mem(vcpu_alias, + vcpu_args->pages * perf_test_args.guest_page_size); + /* * Set up user fault fd to handle demand paging * requests. */ - r = pipe2(&pipefds[i * 2], - O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!r, "Failed to set up pipefd"); - - setup_demand_paging(vm, &uffd_handler_threads[i], - pipefds[i * 2], p->uffd_mode, - p->uffd_delay, &uffd_args[i], - vcpu_hva, vcpu_alias, - vcpu_args->pages * perf_test_args.guest_page_size); + uffd_descs[i] = uffd_setup_demand_paging( + p->uffd_mode, p->uffd_delay, vcpu_hva, + vcpu_args->pages * perf_test_args.guest_page_size, + &handle_uffd_page_request); } } @@ -344,15 +185,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("All vCPU threads joined\n"); if (p->uffd_mode) { - char c; - /* Tell the user fault fd handler threads to quit */ - for (i = 0; i < nr_vcpus; i++) { - r = write(pipefds[i * 2 + 1], &c, 1); - TEST_ASSERT(r == 1, "Unable to write to pipefd"); - - pthread_join(uffd_handler_threads[i], NULL); - } + for (i = 0; i < nr_vcpus; i++) + uffd_stop_demand_paging(uffd_descs[i]); } pr_info("Total guest execution time: %ld.%.9lds\n", @@ -364,11 +199,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_destroy_vm(vm); free(guest_data_prototype); - if (p->uffd_mode) { - free(uffd_handler_threads); - free(uffd_args); - free(pipefds); - } + if (p->uffd_mode) + free(uffd_descs); } static void help(char *name) diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h new file mode 100644 index 0000000000000..877449c345928 --- /dev/null +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM userfaultfd util + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019-2022 Google LLC + */ + +#define _GNU_SOURCE /* for pipe2 */ + +#include +#include +#include +#include + +#include "test_util.h" + +typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); + +struct uffd_desc { + int uffd_mode; + int uffd; + int pipefds[2]; + useconds_t delay; + uffd_handler_t handler; + pthread_t thread; +}; + +struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + void *hva, uint64_t len, + uffd_handler_t handler); + +void uffd_stop_demand_paging(struct uffd_desc *uffd); + +#ifdef PRINT_PER_PAGE_UPDATES +#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +#ifdef PRINT_PER_VCPU_UPDATES +#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) +#endif diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c new file mode 100644 index 0000000000000..3b44846fc277e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM userfaultfd util + * Adapted from demand_paging_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019-2022 Google LLC + */ + +#define _GNU_SOURCE /* for pipe2 */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "test_util.h" +#include "perf_test_util.h" +#include "userfaultfd_util.h" + +#ifdef __NR_userfaultfd + +static void *uffd_handler_thread_fn(void *arg) +{ + struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; + int uffd = uffd_desc->uffd; + int pipefd = uffd_desc->pipefds[0]; + useconds_t delay = uffd_desc->delay; + int64_t pages = 0; + struct timespec start; + struct timespec ts_diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { + struct uffd_msg msg; + struct pollfd pollfd[2]; + char tmp_chr; + int r; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd; + pollfd[1].events = POLLIN; + + r = poll(pollfd, 2, -1); + switch (r) { + case -1: + pr_info("poll err"); + continue; + case 0: + continue; + case 1: + break; + default: + pr_info("Polling uffd returned %d", r); + return NULL; + } + + if (pollfd[0].revents & POLLERR) { + pr_info("uffd revents has POLLERR"); + return NULL; + } + + if (pollfd[1].revents & POLLIN) { + r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(r == 1, + "Error reading pipefd in UFFD thread\n"); + return NULL; + } + + if (!(pollfd[0].revents & POLLIN)) + continue; + + r = read(uffd, &msg, sizeof(msg)); + if (r == -1) { + if (errno == EAGAIN) + continue; + pr_info("Read of uffd got errno %d\n", errno); + return NULL; + } + + if (r != sizeof(msg)) { + pr_info("Read on uffd returned unexpected size: %d bytes", r); + return NULL; + } + + if (!(msg.event & UFFD_EVENT_PAGEFAULT)) + continue; + + if (delay) + usleep(delay); + r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); + if (r < 0) + return NULL; + pages++; + } + + ts_diff = timespec_elapsed(start); + PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", + pages, ts_diff.tv_sec, ts_diff.tv_nsec, + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + return NULL; +} + +struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + void *hva, uint64_t len, + uffd_handler_t handler) +{ + struct uffd_desc *uffd_desc; + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); + int uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; + int ret; + + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + + uffd_desc = malloc(sizeof(struct uffd_desc)); + TEST_ASSERT(uffd_desc, "malloc failed"); + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); + + uffdio_register.range.start = (uint64_t)hva; + uffdio_register.range.len = len; + uffdio_register.mode = uffd_mode; + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + + ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd"); + + uffd_desc->uffd_mode = uffd_mode; + uffd_desc->uffd = uffd; + uffd_desc->delay = delay; + uffd_desc->handler = handler; + pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, + uffd_desc); + + PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", + hva, hva + len); + + return uffd_desc; +} + +void uffd_stop_demand_paging(struct uffd_desc *uffd) +{ + char c = 0; + int ret; + + ret = write(uffd->pipefds[1], &c, 1); + TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + + ret = pthread_join(uffd->thread, NULL); + TEST_ASSERT(ret == 0, "Pthread_join failed."); + + close(uffd->uffd); + + close(uffd->pipefds[1]); + close(uffd->pipefds[0]); + + free(uffd); +} + +#endif /* __NR_userfaultfd */ -- GitLab From 228f324dc718f702e8777164c4e2e7426824fb13 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:22 +0000 Subject: [PATCH 0515/1423] KVM: selftests: aarch64: Add virt_get_pte_hva() library function Add a library function to get the PTE (a host virtual address) of a given GVA. This will be used in a future commit by a test to clear and check the access flag of a particular page. Reviewed-by: Oliver Upton Reviewed-by: Andrew Jones Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-3-ricarkol@google.com --- .../selftests/kvm/include/aarch64/processor.h | 2 ++ tools/testing/selftests/kvm/lib/aarch64/processor.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index a8124f9dd68a4..df4bfac695517 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -109,6 +109,8 @@ void vm_install_exception_handler(struct kvm_vm *vm, void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, handler_fn handler); +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); + static inline void cpu_relax(void) { asm volatile("yield" ::: "memory"); diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6f5551368944e..63ef3c78e55ee 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -138,7 +138,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) _virt_pg_map(vm, vaddr, paddr, attr_idx); } -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; @@ -169,11 +169,18 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) TEST_FAIL("Page table levels must be 2, 3, or 4"); } - return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + return ptep; unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); - exit(1); + exit(EXIT_FAILURE); +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep = virt_get_pte_hva(vm, gva); + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); } static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) -- GitLab From b6b03b86c0250a80b671313dbc0d7bcdbab78f41 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:23 +0000 Subject: [PATCH 0516/1423] KVM: selftests: Add missing close and munmap in __vm_mem_region_delete() Deleting a memslot (when freeing a VM) is not closing the backing fd, nor it's unmapping the alias mapping. Fix by adding the missing close and munmap. Reviewed-by: Andrew Jones Reviewed-by: Oliver Upton Reviewed-by: Ben Gardon Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-4-ricarkol@google.com --- tools/testing/selftests/kvm/lib/kvm_util.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f1cb1627161fd..19e37fb7de7cc 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -586,6 +586,12 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, sparsebit_free(®ion->unused_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + if (region->fd >= 0) { + /* There's an extra map when using shared memory. */ + ret = munmap(region->mmap_alias, region->mmap_size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + close(region->fd); + } free(region); } -- GitLab From 41f5189ea9c08f7fc28340a7aefc93d0d2dcb769 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:24 +0000 Subject: [PATCH 0517/1423] KVM: selftests: aarch64: Construct DEFAULT_MAIR_EL1 using sysreg.h macros Define macros for memory type indexes and construct DEFAULT_MAIR_EL1 with macros from asm/sysreg.h. The index macros can then be used when constructing PTEs (instead of using raw numbers). Reviewed-by: Andrew Jones Reviewed-by: Oliver Upton Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-5-ricarkol@google.com --- .../selftests/kvm/include/aarch64/processor.h | 25 ++++++++++++++----- .../selftests/kvm/lib/aarch64/processor.c | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index df4bfac695517..c1ddca8db225a 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -38,12 +38,25 @@ * NORMAL 4 1111:1111 * NORMAL_WT 5 1011:1011 */ -#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \ - (0x04ul << (1 * 8)) | \ - (0x0cul << (2 * 8)) | \ - (0x44ul << (3 * 8)) | \ - (0xfful << (4 * 8)) | \ - (0xbbul << (5 * 8))) + +/* Linux doesn't use these memory types, so let's define them. */ +#define MAIR_ATTR_DEVICE_GRE UL(0x0c) +#define MAIR_ATTR_NORMAL_WT UL(0xbb) + +#define MT_DEVICE_nGnRnE 0 +#define MT_DEVICE_nGnRE 1 +#define MT_DEVICE_GRE 2 +#define MT_NORMAL_NC 3 +#define MT_NORMAL 4 +#define MT_NORMAL_WT 5 + +#define DEFAULT_MAIR_EL1 \ + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) #define MPIDR_HWID_BITMASK (0xff00fffffful) diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 63ef3c78e55ee..26f0eccff6fee 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -133,7 +133,7 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ + uint64_t attr_idx = MT_NORMAL; _virt_pg_map(vm, vaddr, paddr, attr_idx); } -- GitLab From 590b949597b1e811d35df2f32021dd17d8e47f8c Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:25 +0000 Subject: [PATCH 0518/1423] tools: Copy bitfield.h from the kernel sources Copy bitfield.h from include/linux/bitfield.h. A subsequent change will make use of some FIELD_{GET,PREP} macros defined in this header. The header was copied as-is, no changes needed. Cc: Jakub Kicinski Cc: Arnaldo Carvalho de Melo Reviewed-by: Oliver Upton Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-6-ricarkol@google.com --- tools/include/linux/bitfield.h | 176 +++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 tools/include/linux/bitfield.h diff --git a/tools/include/linux/bitfield.h b/tools/include/linux/bitfield.h new file mode 100644 index 0000000000000..6093fa6db2600 --- /dev/null +++ b/tools/include/linux/bitfield.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2014 Felix Fietkau + * Copyright (C) 2004 - 2009 Ivo van Doorn + */ + +#ifndef _LINUX_BITFIELD_H +#define _LINUX_BITFIELD_H + +#include +#include + +/* + * Bitfield access macros + * + * FIELD_{GET,PREP} macros take as first parameter shifted mask + * from which they extract the base mask and shift amount. + * Mask must be a compilation time constant. + * + * Example: + * + * #define REG_FIELD_A GENMASK(6, 0) + * #define REG_FIELD_B BIT(7) + * #define REG_FIELD_C GENMASK(15, 8) + * #define REG_FIELD_D GENMASK(31, 16) + * + * Get: + * a = FIELD_GET(REG_FIELD_A, reg); + * b = FIELD_GET(REG_FIELD_B, reg); + * + * Set: + * reg = FIELD_PREP(REG_FIELD_A, 1) | + * FIELD_PREP(REG_FIELD_B, 0) | + * FIELD_PREP(REG_FIELD_C, c) | + * FIELD_PREP(REG_FIELD_D, 0x40); + * + * Modify: + * reg &= ~REG_FIELD_C; + * reg |= FIELD_PREP(REG_FIELD_C, c); + */ + +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define __scalar_type_to_unsigned_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (unsigned type)0 + +#define __unsigned_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (unsigned char)0, \ + __scalar_type_to_unsigned_cases(char), \ + __scalar_type_to_unsigned_cases(short), \ + __scalar_type_to_unsigned_cases(int), \ + __scalar_type_to_unsigned_cases(long), \ + __scalar_type_to_unsigned_cases(long long), \ + default: (x))) + +#define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) + +#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ + ({ \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ + _pfx "mask is not constant"); \ + BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ + ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + _pfx "value too large for the field"); \ + BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \ + __bf_cast_unsigned(_reg, ~0ull), \ + _pfx "type of reg too small for mask"); \ + __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ + (1ULL << __bf_shf(_mask))); \ + }) + +/** + * FIELD_MAX() - produce the maximum value representable by a field + * @_mask: shifted mask defining the field's length and position + * + * FIELD_MAX() returns the maximum value that can be held in the field + * specified by @_mask. + */ +#define FIELD_MAX(_mask) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ + (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ + }) + +/** + * FIELD_FIT() - check if value fits in the field + * @_mask: shifted mask defining the field's length and position + * @_val: value to test against the field + * + * Return: true if @_val can fit inside @_mask, false if @_val is too big. + */ +#define FIELD_FIT(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ + !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ + }) + +/** + * FIELD_PREP() - prepare a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP() masks and shifts up the value. The result should + * be combined with other fields of the bitfield using logical OR. + */ +#define FIELD_PREP(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +/** + * FIELD_GET() - extract a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg: value of entire bitfield + * + * FIELD_GET() extracts the field specified by @_mask from the + * bitfield passed in as @_reg by masking and shifting it down. + */ +#define FIELD_GET(_mask, _reg) \ + ({ \ + __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) + +extern void __compiletime_error("value doesn't fit into mask") +__field_overflow(void); +extern void __compiletime_error("bad bitfield mask") +__bad_mask(void); +static __always_inline u64 field_multiplier(u64 field) +{ + if ((field | (field - 1)) & ((field | (field - 1)) + 1)) + __bad_mask(); + return field & -field; +} +static __always_inline u64 field_mask(u64 field) +{ + return field / field_multiplier(field); +} +#define field_max(field) ((typeof(field))field_mask(field)) +#define ____MAKE_OP(type,base,to,from) \ +static __always_inline __##type type##_encode_bits(base v, base field) \ +{ \ + if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ + __field_overflow(); \ + return to((v & field_mask(field)) * field_multiplier(field)); \ +} \ +static __always_inline __##type type##_replace_bits(__##type old, \ + base val, base field) \ +{ \ + return (old & ~to(field)) | type##_encode_bits(val, field); \ +} \ +static __always_inline void type##p_replace_bits(__##type *p, \ + base val, base field) \ +{ \ + *p = (*p & ~to(field)) | type##_encode_bits(val, field); \ +} \ +static __always_inline base type##_get_bits(__##type v, base field) \ +{ \ + return (from(v) & field)/field_multiplier(field); \ +} +#define __MAKE_OP(size) \ + ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ + ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ + ____MAKE_OP(u##size,u##size,,) +____MAKE_OP(u8,u8,,) +__MAKE_OP(16) +__MAKE_OP(32) +__MAKE_OP(64) +#undef __MAKE_OP +#undef ____MAKE_OP + +#endif -- GitLab From bd3ed7e1a47eb7b3838ca09439f1eb289ec3be1f Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:26 +0000 Subject: [PATCH 0519/1423] KVM: selftests: Stash backing_src_type in struct userspace_mem_region Add the backing_src_type into struct userspace_mem_region. This struct already stores a lot of info about memory regions, except the backing source type. This info will be used by a future commit in order to determine the method for punching a hole. Reviewed-by: Oliver Upton Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-7-ricarkol@google.com --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index e42a09cd24a04..a9264ed22cca5 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -34,6 +34,7 @@ struct userspace_mem_region { struct sparsebit *unused_phy_pages; int fd; off_t offset; + enum vm_mem_backing_src_type backing_src_type; void *host_mem; void *host_alias; void *mmap_start; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 19e37fb7de7cc..6affce47e899a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -929,6 +929,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, vm_mem_backing_src_alias(src_type)->name); } + region->backing_src_type = src_type; region->unused_phy_pages = sparsebit_alloc(); sparsebit_set_num(region->unused_phy_pages, guest_paddr >> vm->page_shift, npages); -- GitLab From 290c5b54012b7f05e9c51af32d557574bf69a654 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:27 +0000 Subject: [PATCH 0520/1423] KVM: selftests: Add vm->memslots[] and enum kvm_mem_region_type The vm_create() helpers are hardcoded to place most page types (code, page-tables, stacks, etc) in the same memslot #0, and always backed with anonymous 4K. There are a couple of issues with that. First, tests willing to differ a bit, like placing page-tables in a different backing source type must replicate much of what's already done by the vm_create() functions. Second, the hardcoded assumption of memslot #0 holding most things is spread everywhere; this makes it very hard to change. Fix the above issues by having selftests specify how they want memory to be laid out. Start by changing ____vm_create() to not create memslot #0; a test (to come) will specify all memslots used by the VM. Then, add the vm->memslots[] array to specify the right memslot for different memory allocators, e.g.,: lib/elf should use the vm->[MEM_REGION_CODE] memslot. This will be used as a way to specify the page-tables memslots (to be backed by huge pages for example). There is no functional change intended. The current commit lays out memory exactly as before. A future commit will change the allocators to get the region they should be using, e.g.,: like the page table allocators using the pt memslot. Cc: Sean Christopherson Cc: Andrew Jones Signed-off-by: Ricardo Koller Reviewed-by: Andrew Jones Reviewed-by: Sean Christopherson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-8-ricarkol@google.com --- .../selftests/kvm/include/kvm_util_base.h | 26 +++++++++++++++++-- tools/testing/selftests/kvm/lib/kvm_util.c | 18 +++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index a9264ed22cca5..6442aa9e9061f 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -65,6 +65,14 @@ struct userspace_mem_regions { DECLARE_HASHTABLE(slot_hash, 9); }; +enum kvm_mem_region_type { + MEM_REGION_CODE, + MEM_REGION_DATA, + MEM_REGION_PT, + MEM_REGION_TEST_DATA, + NR_MEM_REGIONS, +}; + struct kvm_vm { int mode; unsigned long type; @@ -93,6 +101,13 @@ struct kvm_vm { int stats_fd; struct kvm_stats_header stats_header; struct kvm_stats_desc *stats_desc; + + /* + * KVM region slots. These are the default memslots used by page + * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] + * memslot. + */ + uint32_t memslots[NR_MEM_REGIONS]; }; @@ -105,6 +120,13 @@ struct kvm_vm { struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot); +static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, + enum kvm_mem_region_type type) +{ + assert(type < NR_MEM_REGIONS); + return memslot2region(vm, vm->memslots[type]); +} + /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 @@ -647,13 +669,13 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to * calculate the amount of memory needed for per-vCPU data, e.g. stacks. */ -struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages); +struct kvm_vm *____vm_create(enum vm_guest_mode mode); struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages); static inline struct kvm_vm *vm_create_barebones(void) { - return ____vm_create(VM_MODE_DEFAULT, 0); + return ____vm_create(VM_MODE_DEFAULT); } static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 6affce47e899a..f3dfa4e9ee0f5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -185,13 +185,10 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); -struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages) +struct kvm_vm *____vm_create(enum vm_guest_mode mode) { struct kvm_vm *vm; - pr_debug("%s: mode='%s' pages='%ld'\n", __func__, - vm_guest_mode_string(mode), nr_pages); - vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); @@ -287,9 +284,6 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages) /* Allocate and setup memory for guest. */ vm->vpages_mapped = sparsebit_alloc(); - if (nr_pages != 0) - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - 0, 0, nr_pages, 0); return vm; } @@ -335,8 +329,16 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, nr_extra_pages); struct kvm_vm *vm; + int i; + + pr_debug("%s: mode='%s' pages='%ld'\n", __func__, + vm_guest_mode_string(mode), nr_pages); + + vm = ____vm_create(mode); - vm = ____vm_create(mode, nr_pages); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; kvm_vm_elf_load(vm, program_invocation_name); -- GitLab From 5485e822e31a75dfac3713d94b6b22025d4895da Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:28 +0000 Subject: [PATCH 0521/1423] KVM: selftests: Fix alignment in virt_arch_pgd_alloc() and vm_vaddr_alloc() Refactor virt_arch_pgd_alloc() and vm_vaddr_alloc() in both RISC-V and aarch64 to fix the alignment of parameters in a couple of calls. This will make it easier to fix the alignment in a future commit that adds an extra parameter (that happens to be very long). No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Ricardo Koller Reviewed-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-9-ricarkol@google.com --- .../selftests/kvm/lib/aarch64/processor.c | 27 ++++++++++--------- .../selftests/kvm/lib/riscv/processor.c | 27 ++++++++++--------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 26f0eccff6fee..6ff2b9d6cea6a 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -76,13 +76,14 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_pages_alloc(vm, - page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); - vm->pgd = paddr; - vm->pgd_created = true; - } + size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + + if (vm->pgd_created) + return; + + vm->pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + vm->pgd_created = true; } static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -325,13 +326,15 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code) { - size_t stack_size = vm->page_size == 4096 ? - DEFAULT_STACK_PGS * vm->page_size : - vm->page_size; - uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); + size_t stack_size; + uint64_t stack_vaddr; struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); + aarch64_vcpu_setup(vcpu, init); vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 6044781512120..ac7fc9d317db3 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -55,13 +55,14 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_pages_alloc(vm, - page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); - vm->pgd = paddr; - vm->pgd_created = true; - } + size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; + + if (vm->pgd_created) + return; + + vm->pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + vm->pgd_created = true; } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -279,15 +280,17 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code) { int r; - size_t stack_size = vm->page_size == 4096 ? - DEFAULT_STACK_PGS * vm->page_size : - vm->page_size; - unsigned long stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_RISCV_GUEST_STACK_VADDR_MIN); + size_t stack_size; + unsigned long stack_vaddr; unsigned long current_gp = 0; struct kvm_mp_state mps; struct kvm_vcpu *vcpu; + stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_RISCV_GUEST_STACK_VADDR_MIN); + vcpu = __vm_vcpu_add(vm, vcpu_id); riscv_vcpu_mmu_setup(vcpu); -- GitLab From 1446e331432d7f24ed56b870ad605a4345fee43f Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:29 +0000 Subject: [PATCH 0522/1423] KVM: selftests: Use the right memslot for code, page-tables, and data allocations Now that kvm_vm allows specifying different memslots for code, page tables, and data, use the appropriate memslot when making allocations in common/libraty code. Change them accordingly: - code (allocated by lib/elf) use the CODE memslot - stacks, exception tables, and other core data pages (like the TSS in x86) use the DATA memslot - page tables and the PGD use the PT memslot - test data (anything allocated with vm_vaddr_alloc()) uses the TEST_DATA memslot No functional change intended. All allocators keep using memslot #0. Cc: Sean Christopherson Cc: Andrew Jones Signed-off-by: Ricardo Koller Reviewed-by: Sean Christopherson Reviewed-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-10-ricarkol@google.com --- .../selftests/kvm/include/kvm_util_base.h | 4 ++ .../selftests/kvm/lib/aarch64/processor.c | 12 ++-- tools/testing/selftests/kvm/lib/elf.c | 3 +- tools/testing/selftests/kvm/lib/kvm_util.c | 57 ++++++++++++------- .../selftests/kvm/lib/riscv/processor.c | 8 ++- .../selftests/kvm/lib/s390x/processor.c | 8 ++- .../selftests/kvm/lib/x86_64/processor.c | 13 +++-- 7 files changed, 65 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6442aa9e9061f..b0da75af1ff33 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -407,7 +407,11 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6ff2b9d6cea6a..2883dfd1ad49e 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -82,7 +82,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) return; vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); vm->pgd_created = true; } @@ -332,8 +333,9 @@ struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : vm->page_size; - stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); aarch64_vcpu_setup(vcpu, init); @@ -438,8 +440,8 @@ unexpected_exception: void vm_init_descriptor_tables(struct kvm_vm *vm) { - vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers), - vm->page_size); + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size, MEM_REGION_DATA); *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index 9f54c098d9d08..51f280c412bad 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -161,7 +161,8 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; - vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart); + vm_vaddr_t vaddr = __vm_vaddr_alloc(vm, seg_size, seg_vstart, + MEM_REGION_CODE); TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " "virtual memory for segment at requested min addr,\n" " segment idx: %u\n" diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f3dfa4e9ee0f5..5ad4acaec8e01 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1226,32 +1226,15 @@ va_found: return pgidx_start * vm->page_size; } -/* - * VM Virtual Address Allocate - * - * Input Args: - * vm - Virtual Machine - * sz - Size in bytes - * vaddr_min - Minimum starting virtual address - * - * Output Args: None - * - * Return: - * Starting guest virtual address - * - * Allocates at least sz bytes within the virtual address space of the vm - * given by vm. The allocated bytes are mapped to a virtual address >= - * the address given by vaddr_min. Note that each allocation uses a - * a unique set of pages, with the minimum real allocation being at least - * a page. - */ -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type) { uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); virt_pgd_alloc(vm); vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, - KVM_UTIL_MIN_PFN * vm->page_size, 0); + KVM_UTIL_MIN_PFN * vm->page_size, + vm->memslots[type]); /* * Find an unused range of virtual page addresses of at least @@ -1272,6 +1255,30 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) return vaddr_start; } +/* + * VM Virtual Address Allocate + * + * Input Args: + * vm - Virtual Machine + * sz - Size in bytes + * vaddr_min - Minimum starting virtual address + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least sz bytes within the virtual address space of the vm + * given by vm. The allocated bytes are mapped to a virtual address >= + * the address given by vaddr_min. Note that each allocation uses a + * a unique set of pages, with the minimum real allocation being at least + * a page. The allocated physical space comes from the TEST_DATA memory region. + */ +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) +{ + return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA); +} + /* * VM Virtual Address Allocate Pages * @@ -1291,6 +1298,11 @@ vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); } +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type) +{ + return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type); +} + /* * VM Virtual Address Allocate Page * @@ -1856,7 +1868,8 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) { - return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); } /* diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index ac7fc9d317db3..d146ca71e0c09 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -61,7 +61,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) return; vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); vm->pgd_created = true; } @@ -288,8 +289,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : vm->page_size; - stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_RISCV_GUEST_STACK_VADDR_MIN); + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_RISCV_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); vcpu = __vm_vcpu_add(vm, vcpu_id); riscv_vcpu_mmu_setup(vcpu); diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 89d7340d9cbd8..15945121daf17 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -21,7 +21,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); vm->pgd = paddr; @@ -167,8 +168,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", vm->page_size); - stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN); + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); vcpu = __vm_vcpu_add(vm, vcpu_id); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 39c4409ef56a6..b199dde90e9ff 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -552,7 +552,7 @@ unmapped_gva: static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { if (!vm->gdt) - vm->gdt = vm_vaddr_alloc_page(vm); + vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); dt->base = vm->gdt; dt->limit = getpagesize(); @@ -562,7 +562,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, int selector) { if (!vm->tss) - vm->tss = vm_vaddr_alloc_page(vm); + vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); memset(segp, 0, sizeof(*segp)); segp->base = vm->tss; @@ -647,8 +647,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, vm_vaddr_t stack_vaddr; struct kvm_vcpu *vcpu; - stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN); + stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); @@ -1145,8 +1146,8 @@ void vm_init_descriptor_tables(struct kvm_vm *vm) extern void *idt_handlers; int i; - vm->idt = vm_vaddr_alloc_page(vm); - vm->handlers = vm_vaddr_alloc_page(vm); + vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, -- GitLab From 35c5810157124cb71aaa939cd2d5508192714877 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:30 +0000 Subject: [PATCH 0523/1423] KVM: selftests: aarch64: Add aarch64/page_fault_test Add a new test for stage 2 faults when using different combinations of guest accesses (e.g., write, S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on hugetlbfs with a hole). The next commits will add different handling methods and more faults (e.g., uffd and dirty logging). This first commit starts by adding two sanity checks for all types of accesses: AF setting by the hw, and accessing memslots with holes. Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-11-ricarkol@google.com --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/page_fault_test.c | 594 ++++++++++++++++++ .../selftests/kvm/include/aarch64/processor.h | 8 + 4 files changed, 604 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/page_fault_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 2f0d705db9dba..4a30d684e208c 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -4,6 +4,7 @@ /aarch64/debug-exceptions /aarch64/get-reg-list /aarch64/hypercalls +/aarch64/page_fault_test /aarch64/psci_test /aarch64/vcpu_width_config /aarch64/vgic_init diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 08a2606aff331..50c30335460f8 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -153,6 +153,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += aarch64/hypercalls +TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test TEST_GEN_PROGS_aarch64 += aarch64/psci_test TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c new file mode 100644 index 0000000000000..28859a96053f6 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * page_fault_test.c - Test stage 2 faults. + * + * This test tries different combinations of guest accesses (e.g., write, + * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on + * hugetlbfs with a hole). It checks that the expected handling method is + * called (e.g., uffd faults with the right address and write/read flag). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "guest_modes.h" +#include "userfaultfd_util.h" + +/* Guest virtual addresses that point to the test page and its PTE. */ +#define TEST_GVA 0xc0000000 +#define TEST_EXEC_GVA (TEST_GVA + 0x8) +#define TEST_PTE_GVA 0xb0000000 +#define TEST_DATA 0x0123456789ABCDEF + +static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; + +#define CMD_NONE (0) +#define CMD_SKIP_TEST (1ULL << 1) +#define CMD_HOLE_PT (1ULL << 2) +#define CMD_HOLE_DATA (1ULL << 3) + +#define PREPARE_FN_NR 10 +#define CHECK_FN_NR 10 + +struct test_desc { + const char *name; + uint64_t mem_mark_cmd; + /* Skip the test if any prepare function returns false */ + bool (*guest_prepare[PREPARE_FN_NR])(void); + void (*guest_test)(void); + void (*guest_test_check[CHECK_FN_NR])(void); + void (*dabt_handler)(struct ex_regs *regs); + void (*iabt_handler)(struct ex_regs *regs); + uint32_t pt_memslot_flags; + uint32_t data_memslot_flags; + bool skip; +}; + +struct test_params { + enum vm_mem_backing_src_type src_type; + struct test_desc *test_desc; +}; + +static inline void flush_tlb_page(uint64_t vaddr) +{ + uint64_t page = vaddr >> 12; + + dsb(ishst); + asm volatile("tlbi vaae1is, %0" :: "r" (page)); + dsb(ish); + isb(); +} + +static void guest_write64(void) +{ + uint64_t val; + + WRITE_ONCE(*guest_test_memory, TEST_DATA); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +/* Check the system for atomic instructions. */ +static bool guest_check_lse(void) +{ + uint64_t isar0 = read_sysreg(id_aa64isar0_el1); + uint64_t atomic; + + atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0); + return atomic >= 2; +} + +static bool guest_check_dc_zva(void) +{ + uint64_t dczid = read_sysreg(dczid_el0); + uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid); + + return dzp == 0; +} + +/* Compare and swap instruction. */ +static void guest_cas(void) +{ + uint64_t val; + + GUEST_ASSERT(guest_check_lse()); + asm volatile(".arch_extension lse\n" + "casal %0, %1, [%2]\n" + :: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory)); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +static void guest_read64(void) +{ + uint64_t val; + + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* Address translation instruction */ +static void guest_at(void) +{ + uint64_t par; + + asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); + par = read_sysreg(par_el1); + isb(); + + /* Bit 1 indicates whether the AT was successful */ + GUEST_ASSERT_EQ(par & 1, 0); +} + +/* + * The size of the block written by "dc zva" is guaranteed to be between (2 << + * 0) and (2 << 9), which is safe in our case as we need the write to happen + * for at least a word, and not more than a page. + */ +static void guest_dc_zva(void) +{ + uint16_t val; + + asm volatile("dc zva, %0" :: "r" (guest_test_memory)); + dsb(ish); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* + * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0). + * And that's special because KVM must take special care with those: they + * should still count as accesses for dirty logging or user-faulting, but + * should be handled differently on mmio. + */ +static void guest_ld_preidx(void) +{ + uint64_t val; + uint64_t addr = TEST_GVA - 8; + + /* + * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is + * in a gap between memslots not backing by anything. + */ + asm volatile("ldr %0, [%1, #8]!" + : "=r" (val), "+r" (addr)); + GUEST_ASSERT_EQ(val, 0); + GUEST_ASSERT_EQ(addr, TEST_GVA); +} + +static void guest_st_preidx(void) +{ + uint64_t val = TEST_DATA; + uint64_t addr = TEST_GVA - 8; + + asm volatile("str %0, [%1, #8]!" + : "+r" (val), "+r" (addr)); + + GUEST_ASSERT_EQ(addr, TEST_GVA); + val = READ_ONCE(*guest_test_memory); +} + +static bool guest_set_ha(void) +{ + uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1); + uint64_t hadbs, tcr; + + /* Skip if HA is not supported. */ + hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1); + if (hadbs == 0) + return false; + + tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; + write_sysreg(tcr, tcr_el1); + isb(); + + return true; +} + +static bool guest_clear_pte_af(void) +{ + *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF; + flush_tlb_page(TEST_GVA); + + return true; +} + +static void guest_check_pte_af(void) +{ + dsb(ish); + GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); +} + +static void guest_exec(void) +{ + int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; + int ret; + + ret = code(); + GUEST_ASSERT_EQ(ret, 0x77); +} + +static bool guest_prepare(struct test_desc *test) +{ + bool (*prepare_fn)(void); + int i; + + for (i = 0; i < PREPARE_FN_NR; i++) { + prepare_fn = test->guest_prepare[i]; + if (prepare_fn && !prepare_fn()) + return false; + } + + return true; +} + +static void guest_test_check(struct test_desc *test) +{ + void (*check_fn)(void); + int i; + + for (i = 0; i < CHECK_FN_NR; i++) { + check_fn = test->guest_test_check[i]; + if (check_fn) + check_fn(); + } +} + +static void guest_code(struct test_desc *test) +{ + if (!guest_prepare(test)) + GUEST_SYNC(CMD_SKIP_TEST); + + GUEST_SYNC(test->mem_mark_cmd); + + if (test->guest_test) + test->guest_test(); + + guest_test_check(test); + GUEST_DONE(); +} + +static void no_dabt_handler(struct ex_regs *regs) +{ + GUEST_ASSERT_1(false, read_sysreg(far_el1)); +} + +static void no_iabt_handler(struct ex_regs *regs) +{ + GUEST_ASSERT_1(false, regs->pc); +} + +/* Returns true to continue the test, and false if it should be skipped. */ +static bool punch_hole_in_backing_store(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + void *hva = (void *)region->region.userspace_addr; + uint64_t paging_size = region->region.memory_size; + int ret, fd = region->fd; + + if (fd != -1) { + ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, paging_size); + TEST_ASSERT(ret == 0, "fallocate failed\n"); + } else { + ret = madvise(hva, paging_size, MADV_DONTNEED); + TEST_ASSERT(ret == 0, "madvise failed\n"); + } + + return true; +} + +/* Returns true to continue the test, and false if it should be skipped. */ +static bool handle_cmd(struct kvm_vm *vm, int cmd) +{ + struct userspace_mem_region *data_region, *pt_region; + bool continue_test = true; + + data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + + if (cmd == CMD_SKIP_TEST) + continue_test = false; + + if (cmd & CMD_HOLE_PT) + continue_test = punch_hole_in_backing_store(vm, pt_region); + if (cmd & CMD_HOLE_DATA) + continue_test = punch_hole_in_backing_store(vm, data_region); + + return continue_test; +} + +typedef uint32_t aarch64_insn_t; +extern aarch64_insn_t __exec_test[2]; + +noinline void __return_0x77(void) +{ + asm volatile("__exec_test: mov x0, #0x77\n" + "ret\n"); +} + +/* + * Note that this function runs on the host before the test VM starts: there's + * no need to sync the D$ and I$ caches. + */ +static void load_exec_code_for_test(struct kvm_vm *vm) +{ + uint64_t *code; + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + assert(TEST_EXEC_GVA > TEST_GVA); + code = hva + TEST_EXEC_GVA - TEST_GVA; + memcpy(code, __exec_test, sizeof(__exec_test)); +} + +static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_DABT, no_dabt_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_IABT, no_iabt_handler); +} + +static void setup_gva_maps(struct kvm_vm *vm) +{ + struct userspace_mem_region *region; + uint64_t pte_gpa; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + /* Map TEST_GVA first. This will install a new PTE. */ + virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr); + /* Then map TEST_PTE_GVA to the above PTE. */ + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + virt_pg_map(vm, TEST_PTE_GVA, pte_gpa); +} + +enum pf_test_memslots { + CODE_AND_DATA_MEMSLOT, + PAGE_TABLE_MEMSLOT, + TEST_DATA_MEMSLOT, +}; + +/* + * Create a memslot for code and data at pfn=0, and test-data and PT ones + * at max_gfn. + */ +static void setup_memslots(struct kvm_vm *vm, struct test_params *p) +{ + uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type); + uint64_t guest_page_size = vm->page_size; + uint64_t max_gfn = vm_compute_max_gfn(vm); + /* Enough for 2M of code when using 4K guest pages. */ + uint64_t code_npages = 512; + uint64_t pt_size, data_size, data_gpa; + + /* + * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using + * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13 + * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use + * twice that just in case. + */ + pt_size = 26 * guest_page_size; + + /* memslot sizes and gpa's must be aligned to the backing page size */ + pt_size = align_up(pt_size, backing_src_pagesz); + data_size = align_up(guest_page_size, backing_src_pagesz); + data_gpa = (max_gfn * guest_page_size) - data_size; + data_gpa = align_down(data_gpa, backing_src_pagesz); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, + CODE_AND_DATA_MEMSLOT, code_npages, 0); + vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT; + vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size, + PAGE_TABLE_MEMSLOT, pt_size / guest_page_size, + p->test_desc->pt_memslot_flags); + vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT, + data_size / guest_page_size, + p->test_desc->data_memslot_flags); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) +{ + struct test_desc *test = p->test_desc; + + pr_debug("Test: %s\n", test->name); + pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(p->src_type)->name); +} + +/* + * This function either succeeds, skips the test (after setting test->skip), or + * fails with a TEST_FAIL that aborts all tests. + */ +static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + struct ucall uc; + + for (;;) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!handle_cmd(vm, uc.args[1])) { + test->skip = true; + goto done; + } + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + pr_debug(test->skip ? "Skipped.\n" : "Done.\n"); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = (struct test_params *)arg; + struct test_desc *test = p->test_desc; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + print_test_banner(mode, p); + + vm = ____vm_create(mode); + setup_memslots(vm, p); + kvm_vm_elf_load(vm, program_invocation_name); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + setup_gva_maps(vm); + + ucall_init(vm, NULL); + + load_exec_code_for_test(vm); + setup_abort_handlers(vm, vcpu, test); + vcpu_args_set(vcpu, 1, test); + + vcpu_run_loop(vm, vcpu, test); + + ucall_uninit(vm); + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-s mem-type]\n", name); + puts(""); + guest_modes_help(); + backing_src_help("-s"); + puts(""); +} + +#define SNAME(s) #s +#define SCAT2(a, b) SNAME(a ## _ ## b) +#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) + +#define _CHECK(_test) _CHECK_##_test +#define _PREPARE(_test) _PREPARE_##_test +#define _PREPARE_guest_read64 NULL +#define _PREPARE_guest_ld_preidx NULL +#define _PREPARE_guest_write64 NULL +#define _PREPARE_guest_st_preidx NULL +#define _PREPARE_guest_exec NULL +#define _PREPARE_guest_at NULL +#define _PREPARE_guest_dc_zva guest_check_dc_zva +#define _PREPARE_guest_cas guest_check_lse + +/* With or without access flag checks */ +#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af +#define _PREPARE_no_af NULL +#define _CHECK_with_af guest_check_pte_af +#define _CHECK_no_af NULL + +/* Performs an access and checks that no faults were triggered. */ +#define TEST_ACCESS(_access, _with_af, _mark_cmd) \ +{ \ + .name = SCAT3(_access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af) }, \ +} + +static struct test_desc tests[] = { + + /* Check that HW is setting the Access Flag (AF) (sanity checks). */ + TEST_ACCESS(guest_read64, with_af, CMD_NONE), + TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_cas, with_af, CMD_NONE), + TEST_ACCESS(guest_write64, with_af, CMD_NONE), + TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE), + TEST_ACCESS(guest_exec, with_af, CMD_NONE), + + /* + * Punch a hole in the data backing store, and then try multiple + * accesses: reads should rturn zeroes, and writes should + * re-populate the page. Moreover, the test also check that no + * exception was generated in the guest. Note that this + * reading/writing behavior is the same as reading/writing a + * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from + * userspace. + */ + TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), + + { 0 } +}; + +static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type) +{ + struct test_desc *t; + + for (t = &tests[0]; t->name; t++) { + if (t->skip) + continue; + + struct test_params p = { + .src_type = src_type, + .test_desc = t, + }; + + for_each_guest_mode(run_test, &p); + } +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type; + int opt; + + setbuf(stdout, NULL); + + src_type = DEFAULT_VM_MEM_SRC; + + while ((opt = getopt(argc, argv, "hm:s:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_test_and_guest_mode(src_type); + return 0; +} diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index c1ddca8db225a..5f977528e09c0 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -105,11 +105,19 @@ enum { #define ESR_EC_MASK (ESR_EC_NUM - 1) #define ESR_EC_SVC64 0x15 +#define ESR_EC_IABT 0x21 +#define ESR_EC_DABT 0x25 #define ESR_EC_HW_BP_CURRENT 0x31 #define ESR_EC_SSTEP_CURRENT 0x33 #define ESR_EC_WP_CURRENT 0x35 #define ESR_EC_BRK_INS 0x3c +/* Access flag */ +#define PTE_AF (1ULL << 10) + +/* Access flag update enable/disable */ +#define TCR_EL1_HA (1ULL << 39) + void aarch64_get_supported_page_sizes(uint32_t ipa, bool *ps4k, bool *ps16k, bool *ps64k); -- GitLab From 3b1d915659c64dce079f4926a648f2271faea008 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:31 +0000 Subject: [PATCH 0524/1423] KVM: selftests: aarch64: Add userfaultfd tests into page_fault_test Add some userfaultfd tests into page_fault_test. Punch holes into the data and/or page-table memslots, perform some accesses, and check that the faults are taken (or not taken) when expected. Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-12-ricarkol@google.com --- .../selftests/kvm/aarch64/page_fault_test.c | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 28859a96053f6..8ecc2ac8c476c 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -35,6 +35,12 @@ static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; #define PREPARE_FN_NR 10 #define CHECK_FN_NR 10 +static struct event_cnt { + int uffd_faults; + /* uffd_faults is incremented from multiple threads. */ + pthread_mutex_t uffd_faults_mutex; +} events; + struct test_desc { const char *name; uint64_t mem_mark_cmd; @@ -42,11 +48,14 @@ struct test_desc { bool (*guest_prepare[PREPARE_FN_NR])(void); void (*guest_test)(void); void (*guest_test_check[CHECK_FN_NR])(void); + uffd_handler_t uffd_pt_handler; + uffd_handler_t uffd_data_handler; void (*dabt_handler)(struct ex_regs *regs); void (*iabt_handler)(struct ex_regs *regs); uint32_t pt_memslot_flags; uint32_t data_memslot_flags; bool skip; + struct event_cnt expected_events; }; struct test_params { @@ -263,7 +272,110 @@ static void no_iabt_handler(struct ex_regs *regs) GUEST_ASSERT_1(false, regs->pc); } +static struct uffd_args { + char *copy; + void *hva; + uint64_t paging_size; +} pt_args, data_args; + /* Returns true to continue the test, and false if it should be skipped. */ +static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, + struct uffd_args *args, bool expect_write) +{ + uint64_t addr = msg->arg.pagefault.address; + uint64_t flags = msg->arg.pagefault.flags; + struct uffdio_copy copy; + int ret; + + TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, + "The only expected UFFD mode is MISSING"); + ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write); + ASSERT_EQ(addr, (uint64_t)args->hva); + + pr_debug("uffd fault: addr=%p write=%d\n", + (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE)); + + copy.src = (uint64_t)args->copy; + copy.dst = addr; + copy.len = args->paging_size; + copy.mode = 0; + + ret = ioctl(uffd, UFFDIO_COPY, ©); + if (ret == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n", + addr, errno); + return ret; + } + + pthread_mutex_lock(&events.uffd_faults_mutex); + events.uffd_faults += 1; + pthread_mutex_unlock(&events.uffd_faults_mutex); + return 0; +} + +static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &pt_args, true); +} + +static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &data_args, true); +} + +static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &data_args, false); +} + +static void setup_uffd_args(struct userspace_mem_region *region, + struct uffd_args *args) +{ + args->hva = (void *)region->region.userspace_addr; + args->paging_size = region->region.memory_size; + + args->copy = malloc(args->paging_size); + TEST_ASSERT(args->copy, "Failed to allocate data copy."); + memcpy(args->copy, args->hva, args->paging_size); +} + +static void setup_uffd(struct kvm_vm *vm, struct test_params *p, + struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd) +{ + struct test_desc *test = p->test_desc; + int uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args); + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args); + + *pt_uffd = NULL; + if (test->uffd_pt_handler) + *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, + pt_args.hva, + pt_args.paging_size, + test->uffd_pt_handler); + + *data_uffd = NULL; + if (test->uffd_data_handler) + *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, + data_args.hva, + data_args.paging_size, + test->uffd_data_handler); +} + +static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, + struct uffd_desc *data_uffd) +{ + if (test->uffd_pt_handler) + uffd_stop_demand_paging(pt_uffd); + if (test->uffd_data_handler) + uffd_stop_demand_paging(data_uffd); + + free(pt_args.copy); + free(data_args.copy); +} + +/* Returns false if the test should be skipped. */ static bool punch_hole_in_backing_store(struct kvm_vm *vm, struct userspace_mem_region *region) { @@ -404,6 +516,11 @@ static void setup_memslots(struct kvm_vm *vm, struct test_params *p) vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; } +static void check_event_counts(struct test_desc *test) +{ + ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); +} + static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) { struct test_desc *test = p->test_desc; @@ -414,6 +531,11 @@ static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) vm_mem_backing_src_alias(p->src_type)->name); } +static void reset_event_counts(void) +{ + memset(&events, 0, sizeof(events)); +} + /* * This function either succeeds, skips the test (after setting test->skip), or * fails with a TEST_FAIL that aborts all tests. @@ -453,6 +575,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct test_desc *test = p->test_desc; struct kvm_vm *vm; struct kvm_vcpu *vcpu; + struct uffd_desc *pt_uffd, *data_uffd; print_test_banner(mode, p); @@ -465,7 +588,16 @@ static void run_test(enum vm_guest_mode mode, void *arg) ucall_init(vm, NULL); + reset_event_counts(); + + /* + * Set some code in the data memslot for the guest to execute (only + * applicable to the EXEC tests). This has to be done before + * setup_uffd() as that function copies the memslot data for the uffd + * handler. + */ load_exec_code_for_test(vm); + setup_uffd(vm, p, &pt_uffd, &data_uffd); setup_abort_handlers(vm, vcpu, test); vcpu_args_set(vcpu, 1, test); @@ -473,6 +605,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) ucall_uninit(vm); kvm_vm_free(vm); + free_uffd(test, pt_uffd, data_uffd); + + /* + * Make sure we check the events after the uffd threads have exited, + * which means they updated their respective event counters. + */ + if (!test->skip) + check_event_counts(test); } static void help(char *name) @@ -488,6 +628,7 @@ static void help(char *name) #define SNAME(s) #s #define SCAT2(a, b) SNAME(a ## _ ## b) #define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) +#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d)) #define _CHECK(_test) _CHECK_##_test #define _PREPARE(_test) _PREPARE_##_test @@ -515,6 +656,21 @@ static void help(char *name) .mem_mark_cmd = _mark_cmd, \ .guest_test = _access, \ .guest_test_check = { _CHECK(_with_af) }, \ + .expected_events = { 0 }, \ +} + +#define TEST_UFFD(_access, _with_af, _mark_cmd, \ + _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \ +{ \ + .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = _uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ } static struct test_desc tests[] = { @@ -545,6 +701,37 @@ static struct test_desc tests[] = { TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), + /* + * Punch holes in the data and PT backing stores and mark them for + * userfaultfd handling. This should result in 2 faults: the access + * on the data backing store, and its respective S1 page table walk + * (S1PTW). + */ + TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* no_af should also lead to a PT write. */ + TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* Note how that cas invokes the read handler. */ + TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* + * Can't test guest_at with_af as it's IMPDEF whether the AF is set. + * The S1PTW fault should still be marked as a write. + */ + TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 1), + TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + { 0 } }; -- GitLab From a4edf25b3e25656c69cbc768d1c704868e4a616f Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:32 +0000 Subject: [PATCH 0525/1423] KVM: selftests: aarch64: Add dirty logging tests into page_fault_test Add some dirty logging tests into page_fault_test. Mark the data and/or page-table memory regions for dirty logging, perform some accesses, and check that the dirty log bits are set or clean when expected. Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-13-ricarkol@google.com --- .../selftests/kvm/aarch64/page_fault_test.c | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 8ecc2ac8c476c..a36001143aff5 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -31,6 +31,11 @@ static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; #define CMD_SKIP_TEST (1ULL << 1) #define CMD_HOLE_PT (1ULL << 2) #define CMD_HOLE_DATA (1ULL << 3) +#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4) +#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5) +#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6) +#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7) +#define CMD_SET_PTE_AF (1ULL << 8) #define PREPARE_FN_NR 10 #define CHECK_FN_NR 10 @@ -213,6 +218,21 @@ static void guest_check_pte_af(void) GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); } +static void guest_check_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_no_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); +} + static void guest_exec(void) { int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; @@ -395,6 +415,22 @@ static bool punch_hole_in_backing_store(struct kvm_vm *vm, return true; } +static bool check_write_in_dirty_log(struct kvm_vm *vm, + struct userspace_mem_region *region, + uint64_t host_pg_nr) +{ + unsigned long *bmap; + bool first_page_dirty; + uint64_t size = region->region.memory_size; + + /* getpage_size() is not always equal to vm->page_size */ + bmap = bitmap_zalloc(size / getpagesize()); + kvm_vm_get_dirty_log(vm, region->region.slot, bmap); + first_page_dirty = test_bit(host_pg_nr, bmap); + free(bmap); + return first_page_dirty; +} + /* Returns true to continue the test, and false if it should be skipped. */ static bool handle_cmd(struct kvm_vm *vm, int cmd) { @@ -411,6 +447,18 @@ static bool handle_cmd(struct kvm_vm *vm, int cmd) continue_test = punch_hole_in_backing_store(vm, pt_region); if (cmd & CMD_HOLE_DATA) continue_test = punch_hole_in_backing_store(vm, data_region); + if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), + "Missing write in dirty log"); + if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0), + "Missing s1ptw write in dirty log"); + if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), + "Unexpected write in dirty log"); + if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0), + "Unexpected s1ptw write in dirty log"); return continue_test; } @@ -673,6 +721,19 @@ static void help(char *name) .expected_events = { .uffd_faults = _uffd_faults, }, \ } +#define TEST_DIRTY_LOG(_access, _with_af, _test_check) \ +{ \ + .name = SCAT3(dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af), _test_check, \ + guest_check_s1ptw_wr_in_dirty_log}, \ + .expected_events = { 0 }, \ +} + static struct test_desc tests[] = { /* Check that HW is setting the Access Flag (AF) (sanity checks). */ @@ -732,6 +793,21 @@ static struct test_desc tests[] = { TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, uffd_data_read_handler, uffd_pt_write_handler, 2), + /* + * Try accesses when the data and PT memory regions are both + * tracked for dirty logging. + */ + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log), + /* no_af should also lead to a PT write. */ + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + { 0 } }; -- GitLab From 45acde40f538a30e759f3b3f4aa5089edf097b2f Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:33 +0000 Subject: [PATCH 0526/1423] KVM: selftests: aarch64: Add readonly memslot tests into page_fault_test Add some readonly memslot tests into page_fault_test. Mark the data and/or page-table memory regions as readonly, perform some accesses, and check that the right fault is triggered when expected (e.g., a store with no write-back should lead to an mmio exit). Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-14-ricarkol@google.com --- .../selftests/kvm/aarch64/page_fault_test.c | 102 +++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index a36001143aff5..727f4f2b6cc44 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -41,6 +41,8 @@ static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; #define CHECK_FN_NR 10 static struct event_cnt { + int mmio_exits; + int fail_vcpu_runs; int uffd_faults; /* uffd_faults is incremented from multiple threads. */ pthread_mutex_t uffd_faults_mutex; @@ -57,6 +59,8 @@ struct test_desc { uffd_handler_t uffd_data_handler; void (*dabt_handler)(struct ex_regs *regs); void (*iabt_handler)(struct ex_regs *regs); + void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run); + void (*fail_vcpu_run_handler)(int ret); uint32_t pt_memslot_flags; uint32_t data_memslot_flags; bool skip; @@ -415,6 +419,31 @@ static bool punch_hole_in_backing_store(struct kvm_vm *vm, return true; } +static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr); + + memcpy(hva, run->mmio.data, run->mmio.len); + events.mmio_exits += 1; +} + +static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + uint64_t data; + + memcpy(&data, run->mmio.data, sizeof(data)); + pr_debug("addr=%lld len=%d w=%d data=%lx\n", + run->mmio.phys_addr, run->mmio.len, + run->mmio.is_write, data); + TEST_FAIL("There was no MMIO exit expected."); +} + static bool check_write_in_dirty_log(struct kvm_vm *vm, struct userspace_mem_region *region, uint64_t host_pg_nr) @@ -463,6 +492,18 @@ static bool handle_cmd(struct kvm_vm *vm, int cmd) return continue_test; } +void fail_vcpu_run_no_handler(int ret) +{ + TEST_FAIL("Unexpected vcpu run failure\n"); +} + +void fail_vcpu_run_mmio_no_syndrome_handler(int ret) +{ + TEST_ASSERT(errno == ENOSYS, + "The mmio handler should have returned not implemented."); + events.fail_vcpu_runs += 1; +} + typedef uint32_t aarch64_insn_t; extern aarch64_insn_t __exec_test[2]; @@ -564,9 +605,20 @@ static void setup_memslots(struct kvm_vm *vm, struct test_params *p) vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; } +static void setup_default_handlers(struct test_desc *test) +{ + if (!test->mmio_handler) + test->mmio_handler = mmio_no_handler; + + if (!test->fail_vcpu_run_handler) + test->fail_vcpu_run_handler = fail_vcpu_run_no_handler; +} + static void check_event_counts(struct test_desc *test) { ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); + ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits); + ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs); } static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) @@ -591,10 +643,18 @@ static void reset_event_counts(void) static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, struct test_desc *test) { + struct kvm_run *run; struct ucall uc; + int ret; + + run = vcpu->run; for (;;) { - vcpu_run(vcpu); + ret = _vcpu_run(vcpu); + if (ret) { + test->fail_vcpu_run_handler(ret); + goto done; + } switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: @@ -608,6 +668,10 @@ static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, break; case UCALL_DONE: goto done; + case UCALL_NONE: + if (run->exit_reason == KVM_EXIT_MMIO) + test->mmio_handler(vm, run); + break; default: TEST_FAIL("Unknown ucall %lu", uc.cmd); } @@ -647,6 +711,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) load_exec_code_for_test(vm); setup_uffd(vm, p, &pt_uffd, &data_uffd); setup_abort_handlers(vm, vcpu, test); + setup_default_handlers(test); vcpu_args_set(vcpu, 1, test); vcpu_run_loop(vm, vcpu, test); @@ -734,6 +799,25 @@ static void help(char *name) .expected_events = { 0 }, \ } +#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ +{ \ + .name = SCAT3(ro_memslot, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .guest_test = _access, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ +} + static struct test_desc tests[] = { /* Check that HW is setting the Access Flag (AF) (sanity checks). */ @@ -808,6 +892,22 @@ static struct test_desc tests[] = { TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + /* + * Try accesses when the data memory region is marked read-only + * (with KVM_MEM_READONLY). Writes with a syndrome result in an + * MMIO exit, writes with no syndrome (e.g., CAS) result in a + * failed vcpu run, and reads/execs with and without syndroms do + * not fault. + */ + TEST_RO_MEMSLOT(guest_read64, 0, 0), + TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0), + TEST_RO_MEMSLOT(guest_at, 0, 0), + TEST_RO_MEMSLOT(guest_exec, 0, 0), + TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), + { 0 } }; -- GitLab From ff2b5509e1d252cd18bb1430b5461d5044701559 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Mon, 17 Oct 2022 19:58:34 +0000 Subject: [PATCH 0527/1423] KVM: selftests: aarch64: Add mix of tests into page_fault_test Add some mix of tests into page_fault_test: memory regions with all the pairwise combinations of read-only, userfaultfd, and dirty-logging. For example, writing into a read-only region which has a hole handled with userfaultfd. Signed-off-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221017195834.2295901-15-ricarkol@google.com --- .../selftests/kvm/aarch64/page_fault_test.c | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 727f4f2b6cc44..05bb6a6369c25 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -399,6 +399,12 @@ static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, free(data_args.copy); } +static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg) +{ + TEST_FAIL("There was no UFFD fault expected."); + return -1; +} + /* Returns false if the test should be skipped. */ static bool punch_hole_in_backing_store(struct kvm_vm *vm, struct userspace_mem_region *region) @@ -799,6 +805,22 @@ static void help(char *name) .expected_events = { 0 }, \ } +#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ + _uffd_faults, _test_check) \ +{ \ + .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test_check = { _CHECK(_with_af), _test_check }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ +} + #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ { \ .name = SCAT3(ro_memslot, _access, _with_af), \ @@ -818,6 +840,59 @@ static void help(char *name) .expected_events = { .fail_vcpu_runs = 1 }, \ } +#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ + _test_check) \ +{ \ + .name = SCAT3(ro_memslot, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits}, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \ +{ \ + .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ +} + +#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \ + _uffd_data_handler, _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_uffd, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits, \ + .uffd_faults = _uffd_faults }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \ + _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1, \ + .uffd_faults = _uffd_faults }, \ +} + static struct test_desc tests[] = { /* Check that HW is setting the Access Flag (AF) (sanity checks). */ @@ -892,6 +967,35 @@ static struct test_desc tests[] = { TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + /* + * Access when the data and PT memory regions are both marked for + * dirty logging and UFFD at the same time. The expected result is + * that writes should mark the dirty log and trigger a userfaultfd + * write fault. Reads/execs should result in a read userfaultfd + * fault, and nothing in the dirty log. Any S1PTW should result in + * a write in the dirty log and a userfaultfd write. + */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + /* no_af should also lead to a PT write. */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler, + 2, guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler, + 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2, + guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler, + 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, + uffd_data_write_handler, 2, + guest_check_write_in_dirty_log), + /* * Try accesses when the data memory region is marked read-only * (with KVM_MEM_READONLY). Writes with a syndrome result in an @@ -908,6 +1012,57 @@ static struct test_desc tests[] = { TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), + /* + * Access when both the data region is both read-only and marked + * for dirty logging at the same time. The expected result is that + * for writes there should be no write in the dirty log. The + * readonly handling is the same as if the memslot was not marked + * for dirty logging: writes with a syndrome result in an MMIO + * exit, and writes with no syndrome result in a failed vcpu run. + */ + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler, + 1, guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx, + guest_check_no_write_in_dirty_log), + + /* + * Access when the data region is both read-only and punched with + * holes tracked with userfaultfd. The expected result is the + * union of both userfaultfd and read-only behaviors. For example, + * write accesses result in a userfaultfd write fault and an MMIO + * exit. Writes with no syndrome result in a failed vcpu run and + * no userfaultfd write fault. Reads result in userfaultfd getting + * triggered. + */ + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, + uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, + uffd_data_write_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, + uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, + uffd_no_handler, 1), + { 0 } }; -- GitLab From 579d7ebe90a332cc5b6c02db9250fd0816a64f63 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 3 Nov 2022 15:05:06 +0000 Subject: [PATCH 0528/1423] KVM: arm64: Fix kvm init failure when mode!=vhe and VA_BITS=52. For nvhe and protected modes, the hyp stage 1 page-tables were previously configured to have the same number of VA bits as the kernel's idmap. However, for kernel configs with VA_BITS=52 and where the kernel is loaded in physical memory below 48 bits, the idmap VA bits is actually smaller than the kernel's normal stage 1 VA bits. This can lead to kernel addresses that can't be mapped into the hypervisor, leading to kvm initialization failure during boot: kvm [1]: IPA Size Limit: 48 bits kvm [1]: Cannot map world-switch code kvm [1]: error initializing Hyp mode: -34 Fix this by ensuring that the hyp stage 1 VA size is the maximum of what's used for the idmap and the regular kernel stage 1. At the same time, refactor the code so that the hyp VA bits is only calculated in one place. Prior to 7ba8f2b2d652, the idmap was always 52 bits for a 52 VA bits kernel and therefore the hyp stage1 was also always 52 bits. Fixes: 7ba8f2b2d652 ("arm64: mm: use a 48-bit ID map when possible on 52-bit VA builds") Signed-off-by: Ryan Roberts [maz: commit message fixes] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221103150507.32948-2-ryan.roberts@arm.com --- arch/arm64/kvm/arm.c | 20 +++----------------- arch/arm64/kvm/mmu.c | 28 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10c..803055da3ee3e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1518,7 +1518,7 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_prepare_hyp_mode(int cpu) +static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; @@ -1534,23 +1534,9 @@ static void cpu_prepare_hyp_mode(int cpu) params->mair_el2 = read_sysreg(mair_el1); - /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, at EL2, there is only one TTBR register, and we can't switch - * between translation tables *and* update TCR_EL2.T0SZ at the same - * time. Bottom line: we need to use the extended range with *both* our - * translation tables. - * - * So use the same T0SZ value we use for the ID map. - */ tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; tcr &= ~TCR_T0SZ_MASK; - tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; + tcr |= TCR_T0SZ(hyp_va_bits); params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); @@ -2054,7 +2040,7 @@ static int init_hyp_mode(void) } /* Prepare the CPU initialization parameters */ - cpu_prepare_hyp_mode(cpu); + cpu_prepare_hyp_mode(cpu, hyp_va_bits); } if (is_protected_kvm_enabled()) { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8c..4efb983cff436 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1618,6 +1618,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { int kvm_mmu_init(u32 *hyp_va_bits) { int err; + u32 idmap_bits; + u32 kernel_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -1631,7 +1633,31 @@ int kvm_mmu_init(u32 *hyp_va_bits) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + /* + * The ID map may be configured to use an extended virtual address + * range. This is only the case if system RAM is out of range for the + * currently configured page size and VA_BITS_MIN, in which case we will + * also need the extended virtual range for the HYP ID map, or we won't + * be able to enable the EL2 MMU. + * + * However, in some cases the ID map may be configured for fewer than + * the number of VA bits used by the regular kernel stage 1. This + * happens when VA_BITS=52 and the kernel image is placed in PA space + * below 48 bits. + * + * At EL2, there is only one TTBR register, and we can't switch between + * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom + * line: we need to use the extended range with *both* our translation + * tables. + * + * So use the maximum of the idmap VA bits and the regular kernel stage + * 1 VA bits to assure that the hypervisor can both ID map its code page + * and map any kernel memory. + */ + idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kernel_bits = vabits_actual; + *hyp_va_bits = max(idmap_bits, kernel_bits); + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", -- GitLab From a0d37784bfd7f699986ba3a64cfeb68a03cb7fd0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 3 Nov 2022 15:05:07 +0000 Subject: [PATCH 0529/1423] KVM: arm64: Fix PAR_TO_HPFAR() to work independently of PA_BITS. Kernel configs with PAGE_SIZE=64KB and PA_BITS=48 still advertise 52 bit IPA space on HW that implements LPA. This is by design (admitedly this is a very unlikely configuration in the real world). However on such a config, attempting to create a vm with the guest kernel placed above 48 bits in IPA space results in misbehaviour due to the hypervisor incorrectly interpretting a faulting IPA. Fix up PAR_TO_HPFAR() to always take 52 bits out of the PAR rather than masking to CONFIG_ARM64_PA_BITS. If the system has a smaller implemented PARange this should be safe because the bits are res0. A more robust approach would be to discover the IPA size in use by the page-table and mask based on that, to avoid relying on res0 reading back as zero. But this information is difficult to access safely from the code's location, so take the easy way out. Fixes: bc1d7de8c550 ("kvm: arm64: Add 52bit support for PAR to HPFAR conversoin") Signed-off-by: Ryan Roberts [maz: commit message fixes] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221103150507.32948-3-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_arm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8aa8492dafc0f..a82f2493a72bb 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -340,9 +340,13 @@ * We have * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + * + * Always assume 52 bit PA since at this point, we don't know how many PA bits + * the page table has been set up for. This should be safe since unused address + * bits in PAR are res0. */ #define PAR_TO_HPFAR(par) \ - (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) + (((par) & GENMASK_ULL(52 - 1, 12)) >> 8) #define ECN(x) { ESR_ELx_EC_##x, #x } -- GitLab From 4554bac48a8c464ff00136a64efe8847e4da4ea8 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:09:59 -0500 Subject: [PATCH 0530/1423] RDMA/rxe: Add ibdev_dbg macros for rxe Add macros borrowed from siw to call dynamic debug macro ibdev_dbg. Link: https://lore.kernel.org/r/20221103171013.20659-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 30fbdf3bc76a3..ab334900fcc3d 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -38,6 +38,25 @@ #define RXE_ROCE_V2_SPORT (0xc000) +#define rxe_dbg(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_dbg_uc(uc, fmt, ...) ibdev_dbg((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_pd(pd, fmt, ...) ibdev_dbg((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_ah(ah, fmt, ...) ibdev_dbg((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_srq(srq, fmt, ...) ibdev_dbg((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_qp(qp, fmt, ...) ibdev_dbg((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_cq(cq, fmt, ...) ibdev_dbg((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mr(mr, fmt, ...) ibdev_dbg((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); -- GitLab From 27c4c520bd3908c095401e93c71e7d3696ae8bdd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:00 -0500 Subject: [PATCH 0531/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_comp.c Replace calls to pr_xxx() in rxe_comp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 66f392810c860..4dca4f8bbb5aa 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -114,7 +114,7 @@ void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); - pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index); + rxe_dbg_qp(qp, "retransmit timer fired\n"); if (qp->valid) { qp->comp.timeout = 1; @@ -334,7 +334,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, return COMPST_ERROR; default: - pr_warn("unexpected nak %x\n", syn); + rxe_dbg_qp(qp, "unexpected nak %x\n", syn); wqe->status = IB_WC_REM_OP_ERR; return COMPST_ERROR; } @@ -345,7 +345,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, break; default: - pr_warn("unexpected opcode\n"); + rxe_dbg_qp(qp, "unexpected opcode\n"); } return COMPST_ERROR; @@ -598,8 +598,7 @@ int rxe_completer(void *arg) state = COMPST_GET_ACK; while (1) { - pr_debug("qp#%d state = %s\n", qp_num(qp), - comp_state_name[state]); + rxe_dbg_qp(qp, "state = %s\n", comp_state_name[state]); switch (state) { case COMPST_GET_ACK: skb = skb_dequeue(&qp->resp_pkts); @@ -746,8 +745,7 @@ int rxe_completer(void *arg) * rnr timer has fired */ qp->req.wait_for_rnr_timer = 1; - pr_debug("qp#%d set rnr nak timer\n", - qp_num(qp)); + rxe_dbg_qp(qp, "set rnr nak timer\n"); mod_timer(&qp->rnr_nak_timer, jiffies + rnrnak_jiffies(aeth_syn(pkt) & ~AETH_TYPE_MASK)); -- GitLab From 52920f537ab08237bd8a3d30ccbb06a9d57717cf Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:01 -0500 Subject: [PATCH 0532/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_cq.c Replace calls to pr_xxx() in rxe_cq.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index b1a0ab3cd4bd1..1df186534639a 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -14,12 +14,12 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, int count; if (cqe <= 0) { - pr_warn("cqe(%d) <= 0\n", cqe); + rxe_dbg(rxe, "cqe(%d) <= 0\n", cqe); goto err1; } if (cqe > rxe->attr.max_cqe) { - pr_debug("cqe(%d) > max_cqe(%d)\n", + rxe_dbg(rxe, "cqe(%d) > max_cqe(%d)\n", cqe, rxe->attr.max_cqe); goto err1; } @@ -27,7 +27,7 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, if (cq) { count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); if (cqe < count) { - pr_debug("cqe(%d) < current # elements in queue (%d)", + rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)", cqe, count); goto err1; } @@ -65,7 +65,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, cq->queue = rxe_queue_init(rxe, &cqe, sizeof(struct rxe_cqe), type); if (!cq->queue) { - pr_warn("unable to create cq\n"); + rxe_dbg(rxe, "unable to create cq\n"); return -ENOMEM; } -- GitLab From 2778b72b1df0c8ef61e0b3b3ef1c8c62eb03fa75 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:02 -0500 Subject: [PATCH 0533/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mr.c Replace calls to pr_xxx() in rxe_mr.c by rxe_dbg_mr(). Link: https://lore.kernel.org/r/20221103171013.20659-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 40 ++++++++++++--------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++ 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index bc081002bddcb..cd846cf82a84c 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -38,8 +38,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) return 0; default: - pr_warn("%s: mr type (%d) not supported\n", - __func__, mr->ibmr.type); + rxe_dbg_mr(mr, "type (%d) not supported\n", mr->ibmr.type); return -EFAULT; } } @@ -125,8 +124,8 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { - pr_warn("%s: Unable to pin memory region err = %d\n", - __func__, (int)PTR_ERR(umem)); + rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", + (int)PTR_ERR(umem)); err = PTR_ERR(umem); goto err_out; } @@ -137,8 +136,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, err = rxe_mr_alloc(mr, num_buf); if (err) { - pr_warn("%s: Unable to allocate memory for map\n", - __func__); + rxe_dbg_mr(mr, "Unable to allocate memory for map\n"); goto err_release_umem; } @@ -159,8 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, vaddr = page_address(sg_page_iter_page(&sg_iter)); if (!vaddr) { - pr_warn("%s: Unable to get virtual address\n", - __func__); + rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; goto err_cleanup_map; } @@ -255,7 +252,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) void *addr; if (mr->state != RXE_MR_STATE_VALID) { - pr_warn("mr not in valid state\n"); + rxe_dbg_mr(mr, "Not in valid state\n"); addr = NULL; goto out; } @@ -266,7 +263,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) } if (mr_check_range(mr, iova, length)) { - pr_warn("range violation\n"); + rxe_dbg_mr(mr, "Range violation\n"); addr = NULL; goto out; } @@ -274,7 +271,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) lookup_iova(mr, iova, &m, &n, &offset); if (offset + length > mr->map[m]->buf[n].size) { - pr_warn("crosses page boundary\n"); + rxe_dbg_mr(mr, "Crosses page boundary\n"); addr = NULL; goto out; } @@ -527,27 +524,26 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { - pr_err("%s: No MR for key %#x\n", __func__, key); + rxe_dbg_mr(mr, "No MR for key %#x\n", key); ret = -EINVAL; goto err; } if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { - pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n", - __func__, key, (mr->rkey ? mr->rkey : mr->lkey)); + rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", + key, (mr->rkey ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } if (atomic_read(&mr->num_mw) > 0) { - pr_warn("%s: Attempt to invalidate an MR while bound to MWs\n", - __func__); + rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); ret = -EINVAL; goto err_drop_ref; } if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { - pr_warn("%s: mr type (%d) is wrong\n", __func__, mr->ibmr.type); + rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); ret = -EINVAL; goto err_drop_ref; } @@ -576,22 +572,20 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) /* user can only register MR in free state */ if (unlikely(mr->state != RXE_MR_STATE_FREE)) { - pr_warn("%s: mr->lkey = 0x%x not free\n", - __func__, mr->lkey); + rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); return -EINVAL; } /* user can only register mr with qp in same protection domain */ if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { - pr_warn("%s: qp->pd and mr->pd don't match\n", - __func__); + rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); return -EINVAL; } /* user is only allowed to change key portion of l/rkey */ if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { - pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n", - __func__, key, mr->lkey); + rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", + key, mr->lkey); return -EINVAL; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index bcdfdadaebbc8..510ae471ac7a8 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -875,6 +875,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); @@ -899,6 +900,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) @@ -930,6 +932,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) -- GitLab From e8a87efdf87455454d0a14fd486c679769bfeee2 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:03 -0500 Subject: [PATCH 0534/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mw.c Replace calls to pr_xxx() int rxe_mw.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mw.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 8df1c9066ed89..afa5ce1a71166 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -52,14 +52,14 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, { if (mw->ibmw.type == IB_MW_TYPE_1) { if (unlikely(mw->state != RXE_MW_STATE_VALID)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a type 1 MW not in the valid state\n"); return -EINVAL; } /* o10-36.2.2 */ if (unlikely((mw->access & IB_ZERO_BASED))) { - pr_err_once("attempt to bind a zero based type 1 MW\n"); + rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n"); return -EINVAL; } } @@ -67,21 +67,21 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (mw->ibmw.type == IB_MW_TYPE_2) { /* o10-37.2.30 */ if (unlikely(mw->state != RXE_MW_STATE_FREE)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a type 2 MW not in the free state\n"); return -EINVAL; } /* C10-72 */ if (unlikely(qp->pd != to_rpd(mw->ibmw.pd))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind type 2 MW with qp with different PD\n"); return -EINVAL; } /* o10-37.2.40 */ if (unlikely(!mr || wqe->wr.wr.mw.length == 0)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to invalidate type 2 MW by binding with NULL or zero length MR\n"); return -EINVAL; } @@ -92,13 +92,13 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, return 0; if (unlikely(mr->access & IB_ZERO_BASED)) { - pr_err_once("attempt to bind MW to zero based MR\n"); + rxe_dbg_mw(mw, "attempt to bind MW to zero based MR\n"); return -EINVAL; } /* C10-73 */ if (unlikely(!(mr->access & IB_ACCESS_MW_BIND))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind an MW to an MR without bind access\n"); return -EINVAL; } @@ -107,7 +107,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (unlikely((mw->access & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind an Writable MW to an MR without local write access\n"); return -EINVAL; } @@ -115,7 +115,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, /* C10-75 */ if (mw->access & IB_ZERO_BASED) { if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a ZB MW outside of the MR\n"); return -EINVAL; } @@ -123,7 +123,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (unlikely((wqe->wr.wr.mw.addr < mr->ibmr.iova) || ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) > (mr->ibmr.iova + mr->ibmr.length)))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a VA MW outside of the MR\n"); return -EINVAL; } -- GitLab From 34549e88e0a3088416177023abf1232fe40e721c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:04 -0500 Subject: [PATCH 0535/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_net.c Replace (some) calls to pr_xxx() in rxe_net.c with rxe_dbg_xxx(). Calls with a rxe device not yet in scope are left as is. Link: https://lore.kernel.org/r/20221103171013.20659-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 38 ++++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index c36cad9c7a664..e02e1624bcf4d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -20,9 +20,10 @@ static struct rxe_recv_sockets recv_sockets; -static struct dst_entry *rxe_find_route4(struct net_device *ndev, - struct in_addr *saddr, - struct in_addr *daddr) +static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, + struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) { struct rtable *rt; struct flowi4 fl = { { 0 } }; @@ -35,7 +36,7 @@ static struct dst_entry *rxe_find_route4(struct net_device *ndev, rt = ip_route_output_key(&init_net, &fl); if (IS_ERR(rt)) { - pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; } @@ -43,7 +44,8 @@ static struct dst_entry *rxe_find_route4(struct net_device *ndev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *rxe_find_route6(struct net_device *ndev, +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -60,12 +62,12 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, recv_sockets.sk6->sk, &fl6, NULL); if (IS_ERR(ndst)) { - pr_err_ratelimited("no route to %pI6\n", daddr); + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; } if (unlikely(ndst->error)) { - pr_err("no route to %pI6\n", daddr); + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); goto put; } @@ -77,7 +79,8 @@ put: #else -static struct dst_entry *rxe_find_route6(struct net_device *ndev, +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -105,14 +108,14 @@ static struct dst_entry *rxe_find_route(struct net_device *ndev, saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route4(ndev, saddr, daddr); + dst = rxe_find_route4(qp, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route6(ndev, saddr6, daddr6); + dst = rxe_find_route6(qp, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = @@ -282,7 +285,7 @@ static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, dst = rxe_find_route(skb->dev, qp, av); if (!dst) { - pr_err("Host not reachable\n"); + rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } @@ -306,7 +309,7 @@ static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, dst = rxe_find_route(skb->dev, qp, av); if (!dst) { - pr_err("Host not reachable\n"); + rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } @@ -365,7 +368,8 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) } else if (skb->protocol == htons(ETH_P_IPV6)) { err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else { - pr_err("Unknown layer 3 protocol: %d\n", skb->protocol); + rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", + skb->protocol); atomic_dec(&pkt->qp->skb_out); rxe_put(pkt->qp); kfree_skb(skb); @@ -373,7 +377,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) } if (unlikely(net_xmit_eval(err))) { - pr_debug("error sending packet: %d\n", err); + rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); return -EAGAIN; } @@ -411,7 +415,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, if ((is_request && (qp->req.state != QP_STATE_READY)) || (!is_request && (qp->resp.state != QP_STATE_READY))) { - pr_info("Packet dropped. QP is not in ready state\n"); + rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } @@ -592,7 +596,7 @@ static int rxe_notify(struct notifier_block *not_blk, rxe_port_down(rxe); break; case NETDEV_CHANGEMTU: - pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_dbg(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_CHANGE: @@ -604,7 +608,7 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: - pr_info("ignoring netdev event = %ld for %s\n", + rxe_dbg(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } -- GitLab From 6af70060d2e561d6479b33fe94cc2f38582e830b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:05 -0500 Subject: [PATCH 0536/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_qp.c Replace calls to pr_xxx() in rxe_qp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-8-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 65 ++++++++++++++---------------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 3f6d62a80bea9..bcbfe6068b8b0 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -19,33 +19,33 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { if (cap->max_send_wr > rxe->attr.max_qp_wr) { - pr_debug("invalid send wr = %u > %d\n", + rxe_dbg(rxe, "invalid send wr = %u > %d\n", cap->max_send_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_send_sge > rxe->attr.max_send_sge) { - pr_debug("invalid send sge = %u > %d\n", + rxe_dbg(rxe, "invalid send sge = %u > %d\n", cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } if (!has_srq) { if (cap->max_recv_wr > rxe->attr.max_qp_wr) { - pr_debug("invalid recv wr = %u > %d\n", + rxe_dbg(rxe, "invalid recv wr = %u > %d\n", cap->max_recv_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_recv_sge > rxe->attr.max_recv_sge) { - pr_debug("invalid recv sge = %u > %d\n", + rxe_dbg(rxe, "invalid recv sge = %u > %d\n", cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } if (cap->max_inline_data > rxe->max_inline_data) { - pr_debug("invalid max inline data = %u > %d\n", + rxe_dbg(rxe, "invalid max inline data = %u > %d\n", cap->max_inline_data, rxe->max_inline_data); goto err1; } @@ -73,7 +73,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) } if (!init->recv_cq || !init->send_cq) { - pr_debug("missing cq\n"); + rxe_dbg(rxe, "missing cq\n"); goto err1; } @@ -82,14 +82,14 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { - pr_debug("invalid port = %d\n", port_num); + rxe_dbg(rxe, "invalid port = %d\n", port_num); goto err1; } port = &rxe->port; if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { - pr_debug("GSI QP exists for port %d\n", port_num); + rxe_dbg(rxe, "GSI QP exists for port %d\n", port_num); goto err1; } } @@ -264,9 +264,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, wqe_size = rcv_wqe_size(qp->rq.max_sge); - pr_debug("qp#%d max_wr = %d, max_sge = %d, wqe_size = %d\n", - qp_num(qp), qp->rq.max_wr, qp->rq.max_sge, wqe_size); - type = QUEUE_TYPE_FROM_CLIENT; qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, type); @@ -395,7 +392,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { - pr_debug("invalid mask or state for qp\n"); + rxe_dbg_qp(qp, "invalid mask or state\n"); goto err1; } @@ -409,7 +406,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_PORT) { if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { - pr_debug("invalid port %d\n", attr->port_num); + rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); goto err1; } } @@ -424,11 +421,11 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { - pr_debug("invalid alt port %d\n", attr->alt_port_num); + rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); goto err1; } if (attr->alt_timeout > 31) { - pr_debug("invalid QP alt timeout %d > 31\n", + rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n", attr->alt_timeout); goto err1; } @@ -441,7 +438,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_mtu mtu = attr->path_mtu; if (mtu > max_mtu) { - pr_debug("invalid mtu (%d) > (%d)\n", + rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n", ib_mtu_enum_to_int(mtu), ib_mtu_enum_to_int(max_mtu)); goto err1; @@ -450,7 +447,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { - pr_debug("invalid max_rd_atomic %d > %d\n", + rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n", attr->max_rd_atomic, rxe->attr.max_qp_rd_atom); goto err1; @@ -459,7 +456,8 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_TIMEOUT) { if (attr->timeout > 31) { - pr_debug("invalid QP timeout %d > 31\n", attr->timeout); + rxe_dbg_qp(qp, "invalid timeout %d > 31\n", + attr->timeout); goto err1; } } @@ -637,27 +635,24 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, if (mask & IB_QP_RETRY_CNT) { qp->attr.retry_cnt = attr->retry_cnt; qp->comp.retry_cnt = attr->retry_cnt; - pr_debug("qp#%d set retry count = %d\n", qp_num(qp), - attr->retry_cnt); + rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt); } if (mask & IB_QP_RNR_RETRY) { qp->attr.rnr_retry = attr->rnr_retry; qp->comp.rnr_retry = attr->rnr_retry; - pr_debug("qp#%d set rnr retry count = %d\n", qp_num(qp), - attr->rnr_retry); + rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry); } if (mask & IB_QP_RQ_PSN) { qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); qp->resp.psn = qp->attr.rq_psn; - pr_debug("qp#%d set resp psn = 0x%x\n", qp_num(qp), - qp->resp.psn); + rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn); } if (mask & IB_QP_MIN_RNR_TIMER) { qp->attr.min_rnr_timer = attr->min_rnr_timer; - pr_debug("qp#%d set min rnr timer = 0x%x\n", qp_num(qp), + rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n", attr->min_rnr_timer); } @@ -665,7 +660,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); qp->req.psn = qp->attr.sq_psn; qp->comp.psn = qp->attr.sq_psn; - pr_debug("qp#%d set req psn = 0x%x\n", qp_num(qp), qp->req.psn); + rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn); } if (mask & IB_QP_PATH_MIG_STATE) @@ -679,40 +674,40 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, switch (attr->qp_state) { case IB_QPS_RESET: - pr_debug("qp#%d state -> RESET\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RESET\n"); rxe_qp_reset(qp); break; case IB_QPS_INIT: - pr_debug("qp#%d state -> INIT\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> INIT\n"); qp->req.state = QP_STATE_INIT; qp->resp.state = QP_STATE_INIT; qp->comp.state = QP_STATE_INIT; break; case IB_QPS_RTR: - pr_debug("qp#%d state -> RTR\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RTR\n"); qp->resp.state = QP_STATE_READY; break; case IB_QPS_RTS: - pr_debug("qp#%d state -> RTS\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RTS\n"); qp->req.state = QP_STATE_READY; qp->comp.state = QP_STATE_READY; break; case IB_QPS_SQD: - pr_debug("qp#%d state -> SQD\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> SQD\n"); rxe_qp_drain(qp); break; case IB_QPS_SQE: - pr_warn("qp#%d state -> SQE !!?\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> SQE !!?\n"); /* Not possible from modify_qp. */ break; case IB_QPS_ERR: - pr_debug("qp#%d state -> ERR\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> ERR\n"); rxe_qp_error(qp); break; } @@ -752,7 +747,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) attr->sq_draining = 0; } - pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + rxe_dbg_qp(qp, "attr->sq_draining = %d\n", attr->sq_draining); return 0; } @@ -764,7 +759,7 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) * will fail immediately. */ if (atomic_read(&qp->mcg_num)) { - pr_debug("Attempt to destroy QP while attached to multicast group\n"); + rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n"); return -EBUSY; } -- GitLab From 0edfb15e30a53e8bd039c35a17f61e49cba9f4dd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:06 -0500 Subject: [PATCH 0537/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_req.c Replace calls to pr_xxx() in rxe_req.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 41f1d84f0acbf..4d45f508392fe 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -100,7 +100,7 @@ void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); - pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp)); + rxe_dbg_qp(qp, "nak timer fired\n"); /* request a send queue retry */ qp->req.need_retry = 1; @@ -595,7 +595,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) } break; default: - pr_err("Unexpected send wqe opcode %d\n", opcode); + rxe_dbg_qp(qp, "Unexpected send wqe opcode %d\n", opcode); wqe->status = IB_WC_LOC_QP_OP_ERR; return -EINVAL; } @@ -748,14 +748,14 @@ int rxe_requester(void *arg) av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { - pr_err("qp#%d Failed no address vector\n", qp_num(qp)); + rxe_dbg_qp(qp, "Failed no address vector\n"); wqe->status = IB_WC_LOC_QP_OP_ERR; goto err; } skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); if (unlikely(!skb)) { - pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); + rxe_dbg_qp(qp, "Failed allocating skb\n"); wqe->status = IB_WC_LOC_QP_OP_ERR; if (ah) rxe_put(ah); @@ -764,7 +764,7 @@ int rxe_requester(void *arg) err = finish_packet(qp, av, wqe, &pkt, skb, payload); if (unlikely(err)) { - pr_debug("qp#%d Error during finish packet\n", qp_num(qp)); + rxe_dbg_qp(qp, "Error during finish packet\n"); if (err == -EFAULT) wqe->status = IB_WC_LOC_PROT_ERR; else -- GitLab From 74ddf7233c571d51bcb802bb192a9f7d77cd8830 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:07 -0500 Subject: [PATCH 0538/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_resp.c Replace calls to pr_xxx() in rxe_resp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-10-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 382d2053db43e..6761bcd1d4d8f 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -317,7 +317,7 @@ static enum resp_states get_srq_wqe(struct rxe_qp *qp) /* don't trust user space data */ if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) { spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); - pr_warn("%s: invalid num_sge in SRQ entry\n", __func__); + rxe_dbg_qp(qp, "invalid num_sge in SRQ entry\n"); return RESPST_ERR_MALFORMED_WQE; } size = sizeof(*wqe) + wqe->dma.num_sge*sizeof(struct rxe_sge); @@ -473,15 +473,14 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (rkey_is_mw(rkey)) { mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { - pr_debug("%s: no MW matches rkey %#x\n", - __func__, rkey); + rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } mr = mw->mr; if (!mr) { - pr_err("%s: MW doesn't have an MR\n", __func__); + rxe_dbg_qp(qp, "MW doesn't have an MR\n"); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -494,8 +493,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { - pr_debug("%s: no MR matches rkey %#x\n", - __func__, rkey); + rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -1064,7 +1062,7 @@ static int send_common_ack(struct rxe_qp *qp, u8 syndrome, u32 psn, err = rxe_xmit_packet(qp, &ack_pkt, skb); if (err) - pr_err_ratelimited("Failed sending %s\n", msg); + rxe_dbg_qp(qp, "Failed sending %s\n", msg); return err; } @@ -1310,8 +1308,7 @@ int rxe_responder(void *arg) } while (1) { - pr_debug("qp#%d state = %s\n", qp_num(qp), - resp_state_name[state]); + rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]); switch (state) { case RESPST_GET_REQ: state = get_req(qp, &pkt); @@ -1468,7 +1465,7 @@ int rxe_responder(void *arg) case RESPST_ERROR: qp->resp.goto_error = 0; - pr_debug("qp#%d moved to error state\n", qp_num(qp)); + rxe_dbg_qp(qp, "moved to error state\n"); rxe_qp_error(qp); goto exit; -- GitLab From 0e6090024b3ebf8d162f82c542eba9632a9c85fc Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:08 -0500 Subject: [PATCH 0539/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_srq.c Replace calls to pr_xxx() in rxe_srq.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-11-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_srq.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 02b39498c370d..82e37a41ced40 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -13,13 +13,13 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) struct ib_srq_attr *attr = &init->attr; if (attr->max_wr > rxe->attr.max_srq_wr) { - pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + rxe_dbg(rxe, "max_wr(%d) > max_srq_wr(%d)\n", attr->max_wr, rxe->attr.max_srq_wr); goto err1; } if (attr->max_wr <= 0) { - pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + rxe_dbg(rxe, "max_wr(%d) <= 0\n", attr->max_wr); goto err1; } @@ -27,7 +27,7 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) attr->max_wr = RXE_MIN_SRQ_WR; if (attr->max_sge > rxe->attr.max_srq_sge) { - pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + rxe_dbg(rxe, "max_sge(%d) > max_srq_sge(%d)\n", attr->max_sge, rxe->attr.max_srq_sge); goto err1; } @@ -65,7 +65,7 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, type = QUEUE_TYPE_FROM_CLIENT; q = rxe_queue_init(rxe, &srq->rq.max_wr, srq_wqe_size, type); if (!q) { - pr_warn("unable to allocate queue for srq\n"); + rxe_dbg_srq(srq, "Unable to allocate queue\n"); return -ENOMEM; } @@ -94,24 +94,24 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) { if (srq->error) { - pr_warn("srq in error state\n"); + rxe_dbg_srq(srq, "in error state\n"); goto err1; } if (mask & IB_SRQ_MAX_WR) { if (attr->max_wr > rxe->attr.max_srq_wr) { - pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + rxe_dbg_srq(srq, "max_wr(%d) > max_srq_wr(%d)\n", attr->max_wr, rxe->attr.max_srq_wr); goto err1; } if (attr->max_wr <= 0) { - pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + rxe_dbg_srq(srq, "max_wr(%d) <= 0\n", attr->max_wr); goto err1; } if (srq->limit && (attr->max_wr < srq->limit)) { - pr_warn("max_wr (%d) < srq->limit (%d)\n", + rxe_dbg_srq(srq, "max_wr (%d) < srq->limit (%d)\n", attr->max_wr, srq->limit); goto err1; } @@ -122,13 +122,13 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, if (mask & IB_SRQ_LIMIT) { if (attr->srq_limit > rxe->attr.max_srq_wr) { - pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + rxe_dbg_srq(srq, "srq_limit(%d) > max_srq_wr(%d)\n", attr->srq_limit, rxe->attr.max_srq_wr); goto err1; } if (attr->srq_limit > srq->rq.queue->buf->index_mask) { - pr_warn("srq_limit (%d) > cur limit(%d)\n", + rxe_dbg_srq(srq, "srq_limit (%d) > cur limit(%d)\n", attr->srq_limit, srq->rq.queue->buf->index_mask); goto err1; -- GitLab From 14e501fdb0de3b45b8ee8a2a031c3c64aaa01817 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:09 -0500 Subject: [PATCH 0540/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_verbs.c Replace calls to pr_xxx() in rxe_verbs.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-12-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 510ae471ac7a8..e6eca21c54e6e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1103,7 +1103,7 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) err = ib_register_device(dev, ibdev_name, NULL); if (err) - pr_warn("%s failed with error %d\n", __func__, err); + rxe_dbg(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread -- GitLab From 25fd735a4c9e38c65904eea2769ed92f6f8586d0 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:10 -0500 Subject: [PATCH 0541/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_av.c Replace calls to pr_xxx() in rxe_av.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-13-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 43 ++++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_loc.h | 8 ++--- drivers/infiniband/sw/rxe/rxe_qp.c | 4 +-- drivers/infiniband/sw/rxe/rxe_verbs.c | 13 ++++---- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 3b05314ca739e..889d7adbd4550 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -14,26 +14,45 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); } -int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) +static int chk_attr(void *obj, struct rdma_ah_attr *attr, bool obj_is_ah) { const struct ib_global_route *grh = rdma_ah_read_grh(attr); struct rxe_port *port; + struct rxe_dev *rxe; + struct rxe_qp *qp; + struct rxe_ah *ah; int type; + if (obj_is_ah) { + ah = obj; + rxe = to_rdev(ah->ibah.device); + } else { + qp = obj; + rxe = to_rdev(qp->ibqp.device); + } + port = &rxe->port; if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { if (grh->sgid_index > port->attr.gid_tbl_len) { - pr_warn("invalid sgid index = %d\n", - grh->sgid_index); + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid sgid index = %d\n", + grh->sgid_index); + else + rxe_dbg_qp(qp, "invalid sgid index = %d\n", + grh->sgid_index); return -EINVAL; } type = rdma_gid_attr_network_type(grh->sgid_attr); if (type < RDMA_NETWORK_IPV4 || type > RDMA_NETWORK_IPV6) { - pr_warn("invalid network type for rdma_rxe = %d\n", - type); + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid network type for rdma_rxe = %d\n", + type); + else + rxe_dbg_qp(qp, "invalid network type for rdma_rxe = %d\n", + type); return -EINVAL; } } @@ -41,6 +60,16 @@ int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) return 0; } +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr) +{ + return chk_attr(qp, attr, false); +} + +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr) +{ + return chk_attr(ah, attr, true); +} + void rxe_av_from_attr(u8 port_num, struct rxe_av *av, struct rdma_ah_attr *attr) { @@ -121,12 +150,12 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp) /* only new user provider or kernel client */ ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num); if (!ah) { - pr_warn("Unable to find AH matching ah_num\n"); + rxe_dbg_qp(pkt->qp, "Unable to find AH matching ah_num\n"); return NULL; } if (rxe_ah_pd(ah) != pkt->qp->pd) { - pr_warn("PDs don't match for AH and QP\n"); + rxe_dbg_qp(pkt->qp, "PDs don't match for AH and QP\n"); rxe_put(ah); return NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index c2a5c8814a48b..a22476d27b384 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -9,16 +9,12 @@ /* rxe_av.c */ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av); - -int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr); - +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr); +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr); void rxe_av_from_attr(u8 port_num, struct rxe_av *av, struct rdma_ah_attr *attr); - void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); - void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); - struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp); /* rxe_cq.c */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index bcbfe6068b8b0..46f6c74ce00e1 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -414,11 +414,11 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; - if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; if (mask & IB_QP_ALT_PATH) { - if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + if (rxe_av_chk_attr(qp, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index e6eca21c54e6e..025b35bf014e2 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -172,10 +172,6 @@ static int rxe_create_ah(struct ib_ah *ibah, ah->is_user = false; } - err = rxe_av_chk_attr(rxe, init_attr->ah_attr); - if (err) - return err; - err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) @@ -184,6 +180,12 @@ static int rxe_create_ah(struct ib_ah *ibah, /* create index > 0 */ ah->ah_num = ah->elem.index; + err = rxe_ah_chk_attr(ah, init_attr->ah_attr); + if (err) { + rxe_cleanup(ah); + return err; + } + if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, @@ -206,10 +208,9 @@ static int rxe_create_ah(struct ib_ah *ibah, static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { int err; - struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); - err = rxe_av_chk_attr(rxe, attr); + err = rxe_ah_chk_attr(ah, attr); if (err) return err; -- GitLab From fc50597934411170ed149d3368b0b733bc5119f1 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:11 -0500 Subject: [PATCH 0542/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_task.c Replace calls to pr_xxx() in rxe_task.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-14-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 0208d833a41b9..60b90e33a8849 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -29,6 +29,7 @@ static void do_task(struct tasklet_struct *t) int cont; int ret; struct rxe_task *task = from_tasklet(task, t, tasklet); + struct rxe_qp *qp = (struct rxe_qp *)task->arg; unsigned int iterations = RXE_MAX_ITERATIONS; spin_lock_bh(&task->lock); @@ -47,7 +48,7 @@ static void do_task(struct tasklet_struct *t) default: spin_unlock_bh(&task->lock); - pr_warn("%s failed with bad state %d\n", __func__, task->state); + rxe_dbg_qp(qp, "failed with bad state %d\n", task->state); return; } @@ -81,8 +82,8 @@ static void do_task(struct tasklet_struct *t) break; default: - pr_warn("%s failed with bad state %d\n", __func__, - task->state); + rxe_dbg_qp(qp, "failed with bad state %d\n", + task->state); } spin_unlock_bh(&task->lock); } while (cont); -- GitLab From c6aba5ea00550017629c56972b635f33bb6a6903 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:12 -0500 Subject: [PATCH 0543/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe.c Replace calls to pr_xxx() in rxe.c with rxe_dbg_xxx(). Calls with a rxe device not yet in scope are left as is. Link: https://lore.kernel.org/r/20221103171013.20659-15-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 51daac5c4feb7..136c2efe34660 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -187,14 +187,14 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) exists = rxe_get_dev_from_net(ndev); if (exists) { ib_device_put(&exists->ib_dev); - pr_err("already configured on %s\n", ndev->name); + rxe_dbg(exists, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { - pr_err("failed to add %s\n", ndev->name); + rxe_dbg(exists, "failed to add %s\n", ndev->name); goto err; } err: -- GitLab From 813728043b79a11f7029ba196decfc7f576ae487 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:13 -0500 Subject: [PATCH 0544/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_icrc.c Replace calls to pr_xxx() in rxe_icrc.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-16-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_icrc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index 46bb07c5c4df2..71bc2c1895888 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -21,7 +21,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) tfm = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(tfm)) { - pr_warn("failed to init crc32 algorithm err:%ld\n", + rxe_dbg(rxe, "failed to init crc32 algorithm err: %ld\n", PTR_ERR(tfm)); return PTR_ERR(tfm); } @@ -51,7 +51,7 @@ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) *(__be32 *)shash_desc_ctx(shash) = crc; err = crypto_shash_update(shash, next, len); if (unlikely(err)) { - pr_warn_ratelimited("failed crc calculation, err: %d\n", err); + rxe_dbg(rxe, "failed crc calculation, err: %d\n", err); return (__force __be32)crc32_le((__force u32)crc, next, len); } -- GitLab From 5de087250f1d8f7b81abaf94110884994793f073 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:14 -0500 Subject: [PATCH 0545/1423] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mmap.c Replace calls to pr_xxx() in rxe_mmap.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-17-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c index 9149b60954296..a47d72dbc5376 100644 --- a/drivers/infiniband/sw/rxe/rxe_mmap.c +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -79,7 +79,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) /* Don't allow a mmap larger than the object. */ if (size > ip->info.size) { - pr_err("mmap region is larger than the object!\n"); + rxe_dbg(rxe, "mmap region is larger than the object!\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -87,7 +87,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) goto found_it; } - pr_warn("unable to find pending mmap info\n"); + rxe_dbg(rxe, "unable to find pending mmap info\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -98,7 +98,7 @@ found_it: ret = remap_vmalloc_range(vma, ip->obj, 0); if (ret) { - pr_err("err %d from remap_vmalloc_range\n", ret); + rxe_dbg(rxe, "err %d from remap_vmalloc_range\n", ret); goto done; } -- GitLab From 8c50cd059c5cd974da4285af17adf0e38905b25b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 31 Oct 2022 10:39:50 -0500 Subject: [PATCH 0546/1423] PCI: altera-msi: Include explicitly pcie-altera-msi.c uses irq_domain_add_linear() and related interfaces, so it needs but doesn't include it directly; it relies on the fact that includes it. But pcie-altera-msi.c *doesn't* need itself. Include directly to remove this implicit dependency so a future patch can drop . Link: https://lore.kernel.org/r/20221031153954.1163623-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-altera-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-altera-msi.c b/drivers/pci/controller/pcie-altera-msi.c index 7b1d3ebc34ecd..4366e042e98ba 100644 --- a/drivers/pci/controller/pcie-altera-msi.c +++ b/drivers/pci/controller/pcie-altera-msi.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include -- GitLab From 606a0430b37af4a1dd3e1e3baaf073b42fbb53e3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 31 Oct 2022 10:39:51 -0500 Subject: [PATCH 0547/1423] PCI: microchip: Include explicitly pcie-microchip-host.c uses irq_domain_add_linear() and related interfaces, so it needs but doesn't include it directly; it relies on the fact that includes it. But pcie-microchip-host.c *doesn't* need itself. Include directly to remove this implicit dependency so a future patch can drop . Link: https://lore.kernel.org/r/20221031153954.1163623-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Conor Dooley --- drivers/pci/controller/pcie-microchip-host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-microchip-host.c b/drivers/pci/controller/pcie-microchip-host.c index 7263d175b5adb..57b2a62f52c8a 100644 --- a/drivers/pci/controller/pcie-microchip-host.c +++ b/drivers/pci/controller/pcie-microchip-host.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include -- GitLab From 763d25e7affe13c7be923fec6192ca6325f33c73 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 31 Oct 2022 10:39:52 -0500 Subject: [PATCH 0548/1423] PCI: mvebu: Include explicitly pci-mvebu.c uses irq_domain_add_linear() and related interfaces but relies on but doesn't include it directly; it relies on the fact that includes it. Include directly to remove this implicit dependency. Link: https://lore.kernel.org/r/20221031153954.1163623-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Thomas Petazzoni --- drivers/pci/controller/pci-mvebu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 1ced73726a267..73db99035c2b7 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- GitLab From 753596dcdb753afb63874c644b2fb20ef7fb3948 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 31 Oct 2022 10:39:53 -0500 Subject: [PATCH 0549/1423] PCI: xgene-msi: Include explicitly pci-xgene-msi.c uses irq_domain_add_linear() and related interfaces, so it needs but doesn't include it directly; it relies on the fact that includes it. But pci-xgene-msi.c *doesn't* need itself. Include directly to remove this implicit dependency so a future patch can drop . Link: https://lore.kernel.org/r/20221031153954.1163623-5-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-xgene-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pci-xgene-msi.c b/drivers/pci/controller/pci-xgene-msi.c index bfa259781b692..bacb14e558eeb 100644 --- a/drivers/pci/controller/pci-xgene-msi.c +++ b/drivers/pci/controller/pci-xgene-msi.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include -- GitLab From 277004d7a4a348de185fb4149ff29a651e994ff4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 31 Oct 2022 10:39:54 -0500 Subject: [PATCH 0550/1423] PCI: Remove unnecessary includes Many host controller drivers #include even though they don't need it. Remove the unnecessary #includes. Link: https://lore.kernel.org/r/20221031153954.1163623-6-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Roy Zang --- drivers/pci/controller/cadence/pci-j721e.c | 1 - drivers/pci/controller/dwc/pci-layerscape.c | 1 - drivers/pci/controller/dwc/pcie-armada8k.c | 1 - drivers/pci/controller/dwc/pcie-tegra194.c | 1 - drivers/pci/controller/pci-v3-semi.c | 1 - drivers/pci/controller/pci-xgene-msi.c | 1 - drivers/pci/controller/pci-xgene.c | 1 - drivers/pci/controller/pcie-altera-msi.c | 1 - drivers/pci/controller/pcie-iproc-platform.c | 1 - drivers/pci/controller/pcie-iproc.c | 1 - drivers/pci/controller/pcie-microchip-host.c | 1 - drivers/pci/controller/pcie-rockchip-host.c | 1 - drivers/pci/controller/pcie-xilinx-cpm.c | 1 - drivers/pci/controller/pcie-xilinx-nwl.c | 1 - 14 files changed, 14 deletions(-) diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index a82f845cc4b52..cc83a8925ce03 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/dwc/pci-layerscape.c b/drivers/pci/controller/dwc/pci-layerscape.c index 879b8692f96a5..ed5fb492fe084 100644 --- a/drivers/pci/controller/dwc/pci-layerscape.c +++ b/drivers/pci/controller/dwc/pci-layerscape.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c index dc469ef8e99b3..5c999e15c357f 100644 --- a/drivers/pci/controller/dwc/pcie-armada8k.c +++ b/drivers/pci/controller/dwc/pcie-armada8k.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "pcie-designware.h" diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 1b6b437823d22..02d78a12b6e72 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pci-v3-semi.c b/drivers/pci/controller/pci-v3-semi.c index 154a5398633cc..784fcf35599c6 100644 --- a/drivers/pci/controller/pci-v3-semi.c +++ b/drivers/pci/controller/pci-v3-semi.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pci-xgene-msi.c b/drivers/pci/controller/pci-xgene-msi.c index bacb14e558eeb..d7987b281f799 100644 --- a/drivers/pci/controller/pci-xgene-msi.c +++ b/drivers/pci/controller/pci-xgene-msi.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index 549d3bd6d1c27..887b4941ff32f 100644 --- a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pcie-altera-msi.c b/drivers/pci/controller/pcie-altera-msi.c index 4366e042e98ba..65e8a20cc4426 100644 --- a/drivers/pci/controller/pcie-altera-msi.c +++ b/drivers/pci/controller/pcie-altera-msi.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c index 538115246c799..4142a73e611d1 100644 --- a/drivers/pci/controller/pcie-iproc-platform.c +++ b/drivers/pci/controller/pcie-iproc-platform.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c index 2519201b0e51c..83029bdfd8844 100644 --- a/drivers/pci/controller/pcie-iproc.c +++ b/drivers/pci/controller/pcie-iproc.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pci/controller/pcie-microchip-host.c b/drivers/pci/controller/pcie-microchip-host.c index 57b2a62f52c8a..0ebf7015e9afd 100644 --- a/drivers/pci/controller/pcie-microchip-host.c +++ b/drivers/pci/controller/pcie-microchip-host.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c index 7352b5ff8d359..c96c0f454570c 100644 --- a/drivers/pci/controller/pcie-rockchip-host.c +++ b/drivers/pci/controller/pcie-rockchip-host.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pcie-xilinx-cpm.c b/drivers/pci/controller/pcie-xilinx-cpm.c index e4ab48041eb6d..4a787a941674b 100644 --- a/drivers/pci/controller/pcie-xilinx-cpm.c +++ b/drivers/pci/controller/pcie-xilinx-cpm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index 40d070e54ad2e..f0271b6c6f8d2 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- GitLab From 0266a177631d4c6b963b5b12dd986a8c5abdbf06 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:30 -0700 Subject: [PATCH 0551/1423] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA). Co-developed-by: Ajay Sharma Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-13-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 9 + drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/mana/Kconfig | 10 + drivers/infiniband/hw/mana/Makefile | 4 + drivers/infiniband/hw/mana/cq.c | 79 ++++ drivers/infiniband/hw/mana/device.c | 117 ++++++ drivers/infiniband/hw/mana/main.c | 521 ++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 162 ++++++++ drivers/infiniband/hw/mana/mr.c | 198 +++++++++ drivers/infiniband/hw/mana/qp.c | 506 +++++++++++++++++++++++ drivers/infiniband/hw/mana/wq.c | 115 ++++++ include/net/mana/mana.h | 3 + include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + include/uapi/rdma/mana-abi.h | 66 +++ 15 files changed, 1793 insertions(+) create mode 100644 drivers/infiniband/hw/mana/Kconfig create mode 100644 drivers/infiniband/hw/mana/Makefile create mode 100644 drivers/infiniband/hw/mana/cq.c create mode 100644 drivers/infiniband/hw/mana/device.c create mode 100644 drivers/infiniband/hw/mana/main.c create mode 100644 drivers/infiniband/hw/mana/mana_ib.h create mode 100644 drivers/infiniband/hw/mana/mr.c create mode 100644 drivers/infiniband/hw/mana/qp.c create mode 100644 drivers/infiniband/hw/mana/wq.c create mode 100644 include/uapi/rdma/mana-abi.h diff --git a/MAINTAINERS b/MAINTAINERS index 441a65d41eb44..4db8e4e02c05f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13669,6 +13669,15 @@ F: drivers/scsi/smartpqi/smartpqi*.[ch] F: include/linux/cciss*.h F: include/uapi/linux/cciss*.h +MICROSOFT MANA RDMA DRIVER +M: Long Li +M: Ajay Sharma +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/mana/ +F: include/net/mana +F: include/uapi/rdma/mana-abi.h + MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH M: Maximilian Luz L: platform-driver-x86@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa36ac618e729..ccc874478f0b6 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" +source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/mthca/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 6b3a88046125a..1211f4317a9f4 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_QIB) += qib/ obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ obj-$(CONFIG_INFINIBAND_EFA) += efa/ obj-$(CONFIG_INFINIBAND_IRDMA) += irdma/ +obj-$(CONFIG_MANA_INFINIBAND) += mana/ obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ diff --git a/drivers/infiniband/hw/mana/Kconfig b/drivers/infiniband/hw/mana/Kconfig new file mode 100644 index 0000000000000..546640657bac2 --- /dev/null +++ b/drivers/infiniband/hw/mana/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MANA_INFINIBAND + tristate "Microsoft Azure Network Adapter support" + depends on NETDEVICES && ETHERNET && PCI && MICROSOFT_MANA + help + This driver provides low-level RDMA support for Microsoft Azure + Network Adapter (MANA). MANA supports RDMA features that can be used + for workloads (e.g. DPDK, MPI etc) that uses RDMA verbs to directly + access hardware from user-mode processes in Microsoft Azure cloud + environment. diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile new file mode 100644 index 0000000000000..88655fe5e398a --- /dev/null +++ b/drivers/infiniband/hw/mana/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o + +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c new file mode 100644 index 0000000000000..d141cab8a1e69 --- /dev/null +++ b/drivers/infiniband/hw/mana/cq.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_create_cq ucmd = {}; + struct mana_ib_dev *mdev; + int err; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy from udata for create cq, %d\n", err); + return err; + } + + if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } + + cq->cqe = attr->cqe; + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + err = PTR_ERR(cq->umem); + ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", + err); + return err; + } + + err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region); + if (err) { + ibdev_dbg(ibdev, + "Failed to create dma region for create cq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, cq->gdma_region); + + /* + * The CQ ID is not known at this time. The ID is generated at create_qp + */ + + return 0; + +err_release_umem: + ib_umem_release(cq->umem); + return err; +} + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); + ib_umem_release(cq->umem); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c new file mode 100644 index 0000000000000..d4541b8707e4c --- /dev/null +++ b/drivers/infiniband/hw/mana/device.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" +#include + +MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(NET_MANA); + +static const struct ib_device_ops mana_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MANA, + .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + + .alloc_pd = mana_ib_alloc_pd, + .alloc_ucontext = mana_ib_alloc_ucontext, + .create_cq = mana_ib_create_cq, + .create_qp = mana_ib_create_qp, + .create_rwq_ind_table = mana_ib_create_rwq_ind_table, + .create_wq = mana_ib_create_wq, + .dealloc_pd = mana_ib_dealloc_pd, + .dealloc_ucontext = mana_ib_dealloc_ucontext, + .dereg_mr = mana_ib_dereg_mr, + .destroy_cq = mana_ib_destroy_cq, + .destroy_qp = mana_ib_destroy_qp, + .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, + .destroy_wq = mana_ib_destroy_wq, + .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_port_immutable = mana_ib_get_port_immutable, + .mmap = mana_ib_mmap, + .modify_qp = mana_ib_modify_qp, + .modify_wq = mana_ib_modify_wq, + .query_device = mana_ib_query_device, + .query_gid = mana_ib_query_gid, + .query_port = mana_ib_query_port, + .reg_user_mr = mana_ib_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mana_ib_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mana_ib_rwq_ind_table, + ib_ind_table), +}; + +static int mana_ib_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_dev *mdev = madev->mdev; + struct mana_context *mc; + struct mana_ib_dev *dev; + int ret; + + mc = mdev->driver_data; + + dev = ib_alloc_device(mana_ib_dev, ib_dev); + if (!dev) + return -ENOMEM; + + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); + + dev->ib_dev.phys_port_cnt = mc->num_ports; + + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + dev->gdma_dev = mdev; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + + /* + * num_comp_vectors needs to set to the max MSIX index + * when interrupts and event queues are implemented + */ + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = mdev->gdma_context->dev; + + ret = ib_register_device(&dev->ib_dev, "mana_%d", + mdev->gdma_context->dev); + if (ret) { + ib_dealloc_device(&dev->ib_dev); + return ret; + } + + dev_set_drvdata(&adev->dev, dev); + + return 0; +} + +static void mana_ib_remove(struct auxiliary_device *adev) +{ + struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + + ib_unregister_device(&dev->ib_dev); + ib_dealloc_device(&dev->ib_dev); +} + +static const struct auxiliary_device_id mana_id_table[] = { + { + .name = "mana.rdma", + }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mana_id_table); + +static struct auxiliary_driver mana_driver = { + .name = "rdma", + .probe = mana_ib_probe, + .remove = mana_ib_remove, + .id_table = mana_id_table, +}; + +module_auxiliary_driver(mana_driver); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c new file mode 100644 index 0000000000000..8b3bc302d6f3a --- /dev/null +++ b/drivers/infiniband/hw/mana/main.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port) +{ + struct gdma_dev *gd = dev->gdma_dev; + struct mana_port_context *mpc; + struct net_device *ndev; + struct mana_context *mc; + + mc = gd->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count--; + WARN_ON(pd->vport_use_count < 0); + + if (!pd->vport_use_count) + mana_uncfg_vport(mpc); + + mutex_unlock(&pd->vport_mutex); +} + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, + u32 doorbell_id) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + int err; + + mc = mdev->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count++; + if (pd->vport_use_count > 1) { + ibdev_dbg(&dev->ib_dev, + "Skip as this PD is already configured vport\n"); + mutex_unlock(&pd->vport_mutex); + return 0; + } + + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id); + if (err) { + pd->vport_use_count--; + mutex_unlock(&pd->vport_mutex); + + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err); + return err; + } + + mutex_unlock(&pd->vport_mutex); + + pd->tx_shortform_allowed = mpc->tx_shortform_allowed; + pd->tx_vp_offset = mpc->tx_vp_offset; + + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", + mpc->port_handle, pd->pdn, doorbell_id); + + return 0; +} + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_create_pd_resp resp = {}; + struct gdma_create_pd_req req = {}; + enum gdma_pd_flags flags = 0; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), + sizeof(resp)); + + req.flags = flags; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to get pd_id err %d status %u\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + pd->pd_handle = resp.pd_handle; + pd->pdn = resp.pd_id; + ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n", + pd->pd_handle, pd->pdn); + + mutex_init(&pd->vport_mutex); + pd->vport_use_count = 0; + return 0; +} + +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_destory_pd_resp resp = {}; + struct gdma_destroy_pd_req req = {}; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req), + sizeof(resp)); + + req.pd_handle = pd->pd_handle; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to destroy pd_handle 0x%llx err %d status %u", + pd->pd_handle, err, resp.hdr.status); + if (!err) + err = -EPROTO; + } + + return err; +} + +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc, + int doorbell_page) +{ + struct gdma_destroy_resource_range_req req = {}; + struct gdma_resp_hdr resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.allocated_resources = doorbell_page; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.status) { + dev_err(gc->dev, + "Failed to destroy doorbell page: ret %d, 0x%x\n", + err, resp.status); + return err ?: -EPROTO; + } + + return 0; +} + +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, + int *doorbell_page) +{ + struct gdma_allocate_resource_range_req req = {}; + struct gdma_allocate_resource_range_resp resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.alignment = 1; + + /* Have GDMA start searching from 0 */ + req.allocated_resources = 0; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, + "Failed to allocate doorbell page: ret %d, 0x%x\n", + err, resp.hdr.status); + return err ?: -EPROTO; + } + + *doorbell_page = resp.allocated_resources; + + return 0; +} + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata) +{ + struct mana_ib_ucontext *ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + struct gdma_dev *dev; + int doorbell_page; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + dev = mdev->gdma_dev; + gc = dev->gdma_context; + + /* Allocate a doorbell page index */ + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); + if (ret) { + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret); + return ret; + } + + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page); + + ucontext->doorbell = doorbell_page; + + return 0; +} + +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell); + if (ret) + ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); +} + +static int +mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, + struct gdma_context *gc, + struct gdma_create_dma_region_req *create_req, + size_t num_pages, mana_handle_t *gdma_region) +{ + struct gdma_create_dma_region_resp create_resp = {}; + unsigned int create_req_msg_size; + int err; + + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages); + create_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, create_req_msg_size, create_req, + sizeof(create_resp), &create_resp); + if (err || create_resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, create_resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + *gdma_region = create_resp.dma_region_handle; + ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n", + *gdma_region); + + return 0; +} + +static int +mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, + struct gdma_dma_region_add_pages_req *add_req, + unsigned int num_pages, u32 expected_status) +{ + unsigned int add_req_msg_size = + struct_size(add_req, page_addr_list, num_pages); + struct gdma_general_resp add_resp = {}; + int err; + + mana_gd_init_req_hdr(&add_req->hdr, GDMA_DMA_REGION_ADD_PAGES, + add_req_msg_size, sizeof(add_resp)); + add_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, add_req_msg_size, add_req, + sizeof(add_resp), &add_resp); + if (err || add_resp.hdr.status != expected_status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, add_resp.hdr.status); + + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region) +{ + struct gdma_dma_region_add_pages_req *add_req = NULL; + size_t num_pages_processed = 0, num_pages_to_handle; + struct gdma_create_dma_region_req *create_req; + unsigned int create_req_msg_size; + struct hw_channel_context *hwc; + struct ib_block_iter biter; + size_t max_pgs_add_cmd = 0; + size_t max_pgs_create_cmd; + struct gdma_context *gc; + size_t num_pages_total; + struct gdma_dev *mdev; + unsigned long page_sz; + unsigned int tail = 0; + u64 *page_addr_list; + void *request_buf; + int err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + hwc = gc->hwc.driver_data; + + /* Hardware requires dma region to align to chosen page size */ + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "failed to find page size.\n"); + return -ENOMEM; + } + num_pages_total = ib_umem_num_dma_blocks(umem, page_sz); + + max_pgs_create_cmd = + (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64); + num_pages_to_handle = + min_t(size_t, num_pages_total, max_pgs_create_cmd); + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages_to_handle); + + request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL); + if (!request_buf) + return -ENOMEM; + + create_req = request_buf; + mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION, + create_req_msg_size, + sizeof(struct gdma_create_dma_region_resp)); + + create_req->length = umem->length; + create_req->offset_in_page = umem->address & (page_sz - 1); + create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->page_count = num_pages_total; + + ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", + umem->length, num_pages_total); + + ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n", + page_sz, create_req->offset_in_page); + + ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u", + num_pages_to_handle, create_req->gdma_page_type); + + page_addr_list = create_req->page_addr_list; + rdma_umem_for_each_dma_block(umem, &biter, page_sz) { + page_addr_list[tail++] = rdma_block_iter_dma_address(&biter); + if (tail < num_pages_to_handle) + continue; + + if (!num_pages_processed) { + /* First create message */ + err = mana_ib_gd_first_dma_region(dev, gc, create_req, + tail, gdma_region); + if (err) + goto out; + + max_pgs_add_cmd = (hwc->max_req_msg_size - + sizeof(*add_req)) / sizeof(u64); + + add_req = request_buf; + add_req->dma_region_handle = *gdma_region; + add_req->reserved3 = 0; + page_addr_list = add_req->page_addr_list; + } else { + /* Subsequent create messages */ + u32 expected_s = 0; + + if (num_pages_processed + num_pages_to_handle < + num_pages_total) + expected_s = GDMA_STATUS_MORE_ENTRIES; + + err = mana_ib_gd_add_dma_region(dev, gc, add_req, tail, + expected_s); + if (err) + break; + } + + num_pages_processed += tail; + tail = 0; + + /* The remaining pages to create */ + num_pages_to_handle = + min_t(size_t, + num_pages_total - num_pages_processed, + max_pgs_add_cmd); + } + + if (err) + mana_ib_gd_destroy_dma_region(dev, *gdma_region); + +out: + kfree(request_buf); + return err; +} + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + + gc = mdev->gdma_context; + ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region); + + return mana_gd_destroy_dma_region(gc, gdma_region); +} + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + phys_addr_t pfn; + pgprot_t prot; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + if (vma->vm_pgoff != 0) { + ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff); + return -EINVAL; + } + + /* Map to the page indexed by ucontext->doorbell */ + pfn = (gc->phys_db_page_base + + gc->db_page_size * mana_ucontext->doorbell) >> + PAGE_SHIFT; + prot = pgprot_writecombine(vma->vm_page_prot); + + ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + NULL); + if (ret) + ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); + else + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", + pfn, gc->db_page_size, ret); + + return ret; +} + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + /* + * This version only support RAW_PACKET + * other values need to be filled for other types + */ + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + props->max_qp = MANA_MAX_NUM_QUEUES; + props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE; + + /* + * max_cqe could be potentially much bigger. + * As this version of driver only support RAW QP, set it to the same + * value as max_qp_wr + */ + props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE; + + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->max_mr = MANA_IB_MAX_MR; + props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES; + props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES; + + return 0; +} + +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + /* This version doesn't return port properties */ + return 0; +} + +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + /* This version doesn't return GID properties */ + return 0; +} + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h new file mode 100644 index 0000000000000..502cc8672eefa --- /dev/null +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + */ + +#ifndef _MANA_IB_H_ +#define _MANA_IB_H_ + +#include +#include +#include +#include +#include + +#include + +#define PAGE_SZ_BM \ + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ + SZ_512K | SZ_1M | SZ_2M) + +/* MANA doesn't have any limit for MR size */ +#define MANA_IB_MAX_MR_SIZE U64_MAX + +/* + * The hardware limit of number of MRs is greater than maximum number of MRs + * that can possibly represent in 24 bits + */ +#define MANA_IB_MAX_MR 0xFFFFFFu + +struct mana_ib_dev { + struct ib_device ib_dev; + struct gdma_dev *gdma_dev; +}; + +struct mana_ib_wq { + struct ib_wq ibwq; + struct ib_umem *umem; + int wqe; + u32 wq_buf_size; + u64 gdma_region; + u64 id; + mana_handle_t rx_object; +}; + +struct mana_ib_pd { + struct ib_pd ibpd; + u32 pdn; + mana_handle_t pd_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + bool tx_shortform_allowed; + u32 tx_vp_offset; +}; + +struct mana_ib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + mana_handle_t mr_handle; +}; + +struct mana_ib_cq { + struct ib_cq ibcq; + struct ib_umem *umem; + int cqe; + u64 gdma_region; + u64 id; +}; + +struct mana_ib_qp { + struct ib_qp ibqp; + + /* Work queue info */ + struct ib_umem *sq_umem; + int sqe; + u64 sq_gdma_region; + u64 sq_id; + mana_handle_t tx_object; + + /* The port on the IB device, starting with 1 */ + u32 port; +}; + +struct mana_ib_ucontext { + struct ib_ucontext ibucontext; + u32 doorbell; +}; + +struct mana_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_ind_table; +}; + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region); + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, + mana_handle_t gdma_region); + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata); + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl); + +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags); + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata); + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); + +int mana_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id, + struct mana_ib_pd *pd, u32 doorbell_id); +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port); + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata); +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma); + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable); +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw); +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid); + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); + +#endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index 0000000000000..a56236cdd9eec --- /dev/null +++ b/drivers/infiniband/hw/mana/mr.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +#define VALID_MR_FLAGS \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) + +static enum gdma_mr_access_flags +mana_ib_verbs_to_gdma_access_flags(int access_flags) +{ + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ; + + if (access_flags & IB_ACCESS_LOCAL_WRITE) + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_WRITE) + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + + return flags; +} + +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, + struct gdma_create_mr_params *mr_params) +{ + struct gdma_create_mr_response resp = {}; + struct gdma_create_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), + sizeof(resp)); + req.pd_handle = mr_params->pd_handle; + req.mr_type = mr_params->mr_type; + + switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GVA: + req.gva.dma_region_handle = mr_params->gva.dma_region_handle; + req.gva.virtual_address = mr_params->gva.virtual_address; + req.gva.access_flags = mr_params->gva.access_flags; + break; + + default: + ibdev_dbg(&dev->ib_dev, + "invalid param (GDMA_MR_TYPE) passed, type %d\n", + req.mr_type); + return -EINVAL; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + mr->ibmr.lkey = resp.lkey; + mr->ibmr.rkey = resp.rkey; + mr->mr_handle = resp.mr_handle; + + return 0; +} + +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, + gdma_obj_handle_t mr_handle) +{ + struct gdma_destroy_mr_response resp = {}; + struct gdma_destroy_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), + sizeof(resp)); + + req.mr_handle = mr_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + return err; + } + + return 0; +} + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + gdma_obj_handle_t dma_region_handle; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + ibdev_dbg(ibdev, + "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", + start, iova, length, access_flags); + + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(ibdev, start, length, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(ibdev, + "Failed to get umem for register user-mr, %d\n", err); + goto err_free; + } + + err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err, + dma_region_handle); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. + */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context, + dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); + struct ib_device *ibdev = ibmr->device; + struct mana_ib_dev *dev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + err = mana_ib_gd_destroy_mr(dev, mr->mr_handle); + if (err) + return err; + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index 0000000000000..ea15ec77e3212 --- /dev/null +++ b/drivers/infiniband/hw/mana/qp.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, + struct net_device *ndev, + mana_handle_t default_rxobj, + mana_handle_t ind_table[], + u32 log_ind_tbl_size, u32 rx_hash_key_len, + u8 *rx_hash_key) +{ + struct mana_port_context *mpc = netdev_priv(ndev); + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + mana_handle_t *req_indir_tab; + struct gdma_context *gc; + struct gdma_dev *mdev; + u32 req_buf_size; + int i, err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + + req_buf_size = + sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req = kzalloc(req_buf_size, GFP_KERNEL); + if (!req) + return -ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = mpc->port_handle; + req->rx_enable = 1; + req->update_default_rxobj = 1; + req->default_rxobj = default_rxobj; + req->hdr.dev_id = mdev->dev_id; + + /* If there are more than 1 entries in indirection table, enable RSS */ + if (log_ind_tbl_size) + req->rss_enable = true; + + req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; + req->indir_tab_offset = sizeof(*req); + req->update_indir_tab = true; + + req_indir_tab = (mana_handle_t *)(req + 1); + /* The ind table passed to the hardware must have + * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * ind_table to MANA_INDIRECT_TABLE_SIZE if required + */ + ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, + req_indir_tab[i]); + } + + req->update_hashkey = true; + if (rx_hash_key_len) + memcpy(req->hashkey, rx_hash_key, rx_hash_key_len); + else + netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE); + + ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n", + req->vport, default_rxobj); + + err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp); + if (err) { + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = -EPROTO; + goto out; + } + + netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n", + mpc->port_handle, log_ind_tbl_size); + +out: + kfree(req); + return err; +} + +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; + struct mana_ib_create_qp_rss_resp resp = {}; + struct mana_ib_create_qp_rss ucmd = {}; + struct gdma_dev *gd = mdev->gdma_dev; + mana_handle_t *mana_ind_table; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_cq *cq; + struct mana_ib_wq *wq; + unsigned int ind_tbl_size; + struct ib_cq *ibcq; + struct ib_wq *ibwq; + int i = 0; + u32 port; + int ret; + + mc = gd->driver_data; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy from udata for create rss-qp, err %d\n", + ret); + return ret; + } + + if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_wr %d exceeding limit\n", + attr->cap.max_recv_wr); + return -EINVAL; + } + + if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_sge %d exceeding limit\n", + attr->cap.max_recv_sge); + return -EINVAL; + } + + ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; + if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + ibdev_dbg(&mdev->ib_dev, + "Indirect table size %d exceeding limit\n", + ind_tbl_size); + return -EINVAL; + } + + if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) { + ibdev_dbg(&mdev->ib_dev, + "RX Hash function is not supported, %d\n", + ucmd.rx_hash_function); + return -EINVAL; + } + + /* IB ports start with 1, MANA start with 0 */ + port = ucmd.port; + if (port < 1 || port > mc->num_ports) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", + port); + return -EINVAL; + } + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + + ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n", + ucmd.rx_hash_function, port); + + mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t), + GFP_KERNEL); + if (!mana_ind_table) { + ret = -ENOMEM; + goto fail; + } + + qp->port = port; + + for (i = 0; i < ind_tbl_size; i++) { + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + + ibcq = ibwq->cq; + cq = container_of(ibcq, struct mana_ib_cq, ibcq); + + wq_spec.gdma_region = wq->gdma_region; + wq_spec.queue_size = wq->wq_buf_size; + + cq_spec.gdma_region = cq->gdma_region; + cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &wq->rx_object); + if (ret) + goto fail; + + /* The GDMA regions are now owned by the WQ object */ + wq->gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_region = GDMA_INVALID_DMA_REGION; + + wq->id = wq_spec.queue_index; + cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", + ret, wq->rx_object, wq->id, cq->id); + + resp.entries[i].cqid = cq->id; + resp.entries[i].wqid = wq->id; + + mana_ind_table[i] = wq->rx_object; + } + resp.num_entries = i; + + ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object, + mana_ind_table, + ind_tbl->log_ind_tbl_size, + ucmd.rx_hash_key_len, + ucmd.rx_hash_key); + if (ret) + goto fail; + + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy to udata create rss-qp, %d\n", + ret); + goto fail; + } + + kfree(mana_ind_table); + + return 0; + +fail: + while (i-- > 0) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + kfree(mana_ind_table); + + return ret; +} + +static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_cq *send_cq = + container_of(attr->send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_ucontext *mana_ucontext = + rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + struct mana_ib_create_qp_resp resp = {}; + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_ib_create_qp ucmd = {}; + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct ib_umem *umem; + int err; + u32 port; + + mc = gd->driver_data; + + if (!mana_ucontext || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata create qp-raw, %d\n", err); + return err; + } + + /* IB ports start with 1, MANA Ethernet ports start with 0 */ + port = ucmd.port; + if (ucmd.port > mc->num_ports) + return -EINVAL; + + if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_wr %d exceeding limit\n", + attr->cap.max_send_wr); + return -EINVAL; + } + + if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_sge %d exceeding limit\n", + attr->cap.max_send_sge); + return -EINVAL; + } + + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc); + + err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell); + if (err) + return -ENODEV; + + qp->port = port; + + ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", + ucmd.sq_buf_addr, ucmd.port); + + umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create qp-raw, err %d\n", + err); + goto err_free_vport; + } + qp->sq_umem = umem; + + err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, + &qp->sq_gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create qp-raw, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, qp->sq_gdma_region); + + /* Create a WQ on the same port handle used by the Ethernet */ + wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.queue_size = ucmd.sq_buf_size; + + cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, + &cq_spec, &qp->tx_object); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create wq for create raw-qp, err %d\n", + err); + goto err_destroy_dma_region; + } + + /* The GDMA regions are now owned by the WQ object */ + qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + + qp->sq_id = wq_spec.queue_index; + send_cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, + qp->tx_object, qp->sq_id, send_cq->id); + + resp.sqid = qp->sq_id; + resp.cqid = send_cq->id; + resp.tx_vp_offset = pd->tx_vp_offset; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy udata for create qp-raw, %d\n", + err); + goto err_destroy_wq_obj; + } + + return 0; + +err_destroy_wq_obj: + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + +err_destroy_dma_region: + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + +err_release_umem: + ib_umem_release(umem); + +err_free_vport: + mana_ib_uncfg_vport(mdev, pd, port - 1); + + return err; +} + +int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + /* When rwq_ind_tbl is used, it's for creating WQs for RSS */ + if (attr->rwq_ind_tbl) + return mana_ib_create_qp_rss(ibqp, ibqp->pd, attr, + udata); + + return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + default: + /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ + ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", + attr->qp_type); + } + + return -EINVAL; +} + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + /* modify_qp is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, + struct ib_rwq_ind_table *ind_tbl, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_wq *wq; + struct ib_wq *ibwq; + int i; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n", + wq->rx_object); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + return 0; +} + +static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct ib_pd *ibpd = qp->ibqp.pd; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_pd *pd; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + pd = container_of(ibpd, struct mana_ib_pd, ibpd); + + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + + if (qp->sq_umem) { + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + ib_umem_release(qp->sq_umem); + } + + mana_ib_uncfg_vport(mdev, pd, qp->port - 1); + + return 0; +} + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + switch (ibqp->qp_type) { + case IB_QPT_RAW_PACKET: + if (ibqp->rwq_ind_tbl) + return mana_ib_destroy_qp_rss(qp, ibqp->rwq_ind_tbl, + udata); + + return mana_ib_destroy_qp_raw(qp, udata); + + default: + ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", + ibqp->qp_type); + } + + return -ENOENT; +} diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c new file mode 100644 index 0000000000000..372d361510e0c --- /dev/null +++ b/drivers/infiniband/hw/mana/wq.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_create_wq ucmd = {}; + struct mana_ib_wq *wq; + struct ib_umem *umem; + int err; + + if (udata->inlen < sizeof(ucmd)) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata for create wq, %d\n", err); + return ERR_PTR(err); + } + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); + + umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create wq, err %d\n", err); + goto err_free_wq; + } + + wq->umem = umem; + wq->wqe = init_attr->max_wr; + wq->wq_buf_size = ucmd.wq_buf_size; + wq->rx_object = INVALID_MANA_HANDLE; + + err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create wq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, wq->gdma_region); + + /* WQ ID is returned at wq_create time, doesn't know the value yet */ + + return &wq->ibwq; + +err_release_umem: + ib_umem_release(umem); + +err_free_wq: + kfree(wq); + + return ERR_PTR(err); +} + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + /* modify_wq is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) +{ + struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq); + struct ib_device *ib_dev = ibwq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); + ib_umem_release(wq->umem); + + kfree(wq); + + return 0; +} + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 713a8f8cca9a7..20212ffeefb9e 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -412,6 +412,9 @@ int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); extern const struct ethtool_ops mana_ethtool_ops; +/* A CQ can be created not associated with any EQ */ +#define GDMA_CQ_NO_EQ 0xffff + struct mana_obj_spec { u32 queue_index; u64 gdma_region; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 7dd56210226f5..e0c25537fd2e9 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -251,6 +251,7 @@ enum rdma_driver_id { RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, + RDMA_DRIVER_MANA, }; enum ib_uverbs_gid_type { diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h new file mode 100644 index 0000000000000..5fcb31b37fb91 --- /dev/null +++ b/include/uapi/rdma/mana-abi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#ifndef MANA_ABI_USER_H +#define MANA_ABI_USER_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MANA_IB_UVERBS_ABI_VERSION 1 + +struct mana_ib_create_cq { + __aligned_u64 buf_addr; +}; + +struct mana_ib_create_qp { + __aligned_u64 sq_buf_addr; + __u32 sq_buf_size; + __u32 port; +}; + +struct mana_ib_create_qp_resp { + __u32 sqid; + __u32 cqid; + __u32 tx_vp_offset; + __u32 reserved; +}; + +struct mana_ib_create_wq { + __aligned_u64 wq_buf_addr; + __u32 wq_buf_size; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mana_ib_rx_hash_function_flags { + MANA_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +struct mana_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 reserved[7]; + __u32 rx_hash_key_len; + __u8 rx_hash_key[40]; + __u32 port; +}; + +struct rss_resp_entry { + __u32 cqid; + __u32 wqid; +}; + +struct mana_ib_create_qp_rss_resp { + __aligned_u64 num_entries; + struct rss_resp_entry entries[64]; +}; + +#endif -- GitLab From 61c581a46a9668747d355436bd4b2505594539bd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Nov 2022 20:22:57 +0100 Subject: [PATCH 0552/1423] crypto: move gf128mul library into lib/crypto The gf128mul library does not depend on the crypto API at all, so it can be moved into lib/crypto. This will allow us to use it in other library code in a subsequent patch without having to depend on CONFIG_CRYPTO. While at it, change the Kconfig symbol name to align with other crypto library implementations. However, the source file name is retained, as it is reflected in the module .ko filename, and changing this might break things for users. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 2 +- arch/arm64/crypto/Kconfig | 2 +- crypto/Kconfig | 9 +++------ crypto/Makefile | 1 - drivers/crypto/chelsio/Kconfig | 2 +- lib/crypto/Kconfig | 3 +++ lib/crypto/Makefile | 2 ++ {crypto => lib/crypto}/gf128mul.c | 0 8 files changed, 11 insertions(+), 10 deletions(-) rename {crypto => lib/crypto}/gf128mul.c (100%) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 3858c4d4cb988..7b2b7d043d9be 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -18,7 +18,7 @@ config CRYPTO_GHASH_ARM_CE depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD - select CRYPTO_GF128MUL + select CRYPTO_LIB_GF128MUL help GCM GHASH function (NIST SP800-38D) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 6793d5bc3ee53..6d06b448a66e0 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -6,8 +6,8 @@ config CRYPTO_GHASH_ARM64_CE tristate "Hash functions: GHASH (ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON select CRYPTO_HASH - select CRYPTO_GF128MUL select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL select CRYPTO_AEAD help GCM GHASH function (NIST SP800-38D) diff --git a/crypto/Kconfig b/crypto/Kconfig index d779667671b23..9c86f70451576 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -175,9 +175,6 @@ config CRYPTO_MANAGER_EXTRA_TESTS This is intended for developer use only, as these tests take much longer to run than the normal self tests. -config CRYPTO_GF128MUL - tristate - config CRYPTO_NULL tristate "Null algorithms" select CRYPTO_NULL2 @@ -714,9 +711,9 @@ config CRYPTO_KEYWRAP config CRYPTO_LRW tristate "LRW (Liskov Rivest Wagner)" + select CRYPTO_LIB_GF128MUL select CRYPTO_SKCIPHER select CRYPTO_MANAGER - select CRYPTO_GF128MUL select CRYPTO_ECB help LRW (Liskov Rivest Wagner) mode @@ -926,8 +923,8 @@ config CRYPTO_CMAC config CRYPTO_GHASH tristate "GHASH" - select CRYPTO_GF128MUL select CRYPTO_HASH + select CRYPTO_LIB_GF128MUL help GCM GHASH function (NIST SP800-38D) @@ -967,8 +964,8 @@ config CRYPTO_MICHAEL_MIC config CRYPTO_POLYVAL tristate - select CRYPTO_GF128MUL select CRYPTO_HASH + select CRYPTO_LIB_GF128MUL help POLYVAL hash function for HCTR2 diff --git a/crypto/Makefile b/crypto/Makefile index 303b21c43df05..d0126c915834b 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -85,7 +85,6 @@ obj-$(CONFIG_CRYPTO_WP512) += wp512.o CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149 obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o CFLAGS_blake2b_generic.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 -obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o obj-$(CONFIG_CRYPTO_ECB) += ecb.o obj-$(CONFIG_CRYPTO_CBC) += cbc.o obj-$(CONFIG_CRYPTO_CFB) += cfb.o diff --git a/drivers/crypto/chelsio/Kconfig b/drivers/crypto/chelsio/Kconfig index f886401af13e7..5dd3f6a4781a2 100644 --- a/drivers/crypto/chelsio/Kconfig +++ b/drivers/crypto/chelsio/Kconfig @@ -3,11 +3,11 @@ config CRYPTO_DEV_CHELSIO tristate "Chelsio Crypto Co-processor Driver" depends on CHELSIO_T4 select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL select CRYPTO_SHA1 select CRYPTO_SHA256 select CRYPTO_SHA512 select CRYPTO_AUTHENC - select CRYPTO_GF128MUL help The Chelsio Crypto Co-processor driver for T6 adapters. diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 7e9683e9f5c63..6767d86959de8 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -11,6 +11,9 @@ config CRYPTO_LIB_AES config CRYPTO_LIB_ARC4 tristate +config CRYPTO_LIB_GF128MUL + tristate + config CRYPTO_ARCH_HAVE_LIB_BLAKE2S bool help diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index c852f067ab060..7000eeb72286e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -13,6 +13,8 @@ libaes-y := aes.o obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o libarc4-y := arc4.o +obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o + # blake2s is used by the /dev/random driver which is always builtin obj-y += libblake2s.o libblake2s-y := blake2s.o diff --git a/crypto/gf128mul.c b/lib/crypto/gf128mul.c similarity index 100% rename from crypto/gf128mul.c rename to lib/crypto/gf128mul.c -- GitLab From b67ce439fef69a1a339cf2743c8198e8d90e6821 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Nov 2022 20:22:58 +0100 Subject: [PATCH 0553/1423] crypto: lib/gf128mul - make gf128mul_lle time invariant The gf128mul library has different variants with different memory/performance tradeoffs, where the faster ones use 4k or 64k lookup tables precomputed at runtime, which are based on one of the multiplication factors, which is commonly the key for keyed hash algorithms such as GHASH. The slowest variant is gf128_mul_lle() [and its bbe/ble counterparts], which does not use precomputed lookup tables, but it still relies on a single u16[256] lookup table which is input independent. The use of such a table may cause the execution time of gf128_mul_lle() to correlate with the value of the inputs, which is generally something that must be avoided for cryptographic algorithms. On top of that, the function uses a sequence of if () statements that conditionally invoke be128_xor() based on which bits are set in the second argument of the function, which is usually a pointer to the multiplication factor that represents the key. In order to remove the correlation between the execution time of gf128_mul_lle() and the value of its inputs, let's address the identified shortcomings: - add a time invariant version of gf128mul_x8_lle() that replaces the table lookup with the expression that is used at compile time to populate the lookup table; - make the invocations of be128_xor() unconditional, but pass a zero vector as the third argument if the associated bit in the key is cleared. The resulting code is likely to be significantly slower. However, given that this is the slowest version already, making it even slower in order to make it more secure is assumed to be justified. The bbe and ble counterparts could receive the same treatment, but the former is never used anywhere in the kernel, and the latter is only used in the driver for a asynchronous crypto h/w accelerator (Chelsio), where timing variances are unlikely to matter. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- lib/crypto/gf128mul.c | 58 +++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c index a69ae3e6c16cb..8f8c45e0cdcf2 100644 --- a/lib/crypto/gf128mul.c +++ b/lib/crypto/gf128mul.c @@ -146,6 +146,17 @@ static void gf128mul_x8_lle(be128 *x) x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } +/* time invariant version of gf128mul_x8_lle */ +static void gf128mul_x8_lle_ti(be128 *x) +{ + u64 a = be64_to_cpu(x->a); + u64 b = be64_to_cpu(x->b); + u64 _tt = xda_le(b & 0xff); /* avoid table lookup */ + + x->b = cpu_to_be64((b >> 8) | (a << 56)); + x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); +} + static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); @@ -169,38 +180,47 @@ EXPORT_SYMBOL(gf128mul_x8_ble); void gf128mul_lle(be128 *r, const be128 *b) { - be128 p[8]; + /* + * The p array should be aligned to twice the size of its element type, + * so that every even/odd pair is guaranteed to share a cacheline + * (assuming a cacheline size of 32 bytes or more, which is by far the + * most common). This ensures that each be128_xor() call in the loop + * takes the same amount of time regardless of the value of 'ch', which + * is derived from function parameter 'b', which is commonly used as a + * key, e.g., for GHASH. The odd array elements are all set to zero, + * making each be128_xor() a NOP if its associated bit in 'ch' is not + * set, and this is equivalent to calling be128_xor() conditionally. + * This approach aims to avoid leaking information about such keys + * through execution time variances. + * + * Unfortunately, __aligned(16) or higher does not work on x86 for + * variables on the stack so we need to perform the alignment by hand. + */ + be128 array[16 + 3] = {}; + be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128)); int i; p[0] = *r; for (i = 0; i < 7; ++i) - gf128mul_x_lle(&p[i + 1], &p[i]); + gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]); memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[15 - i]; - if (ch & 0x80) - be128_xor(r, r, &p[0]); - if (ch & 0x40) - be128_xor(r, r, &p[1]); - if (ch & 0x20) - be128_xor(r, r, &p[2]); - if (ch & 0x10) - be128_xor(r, r, &p[3]); - if (ch & 0x08) - be128_xor(r, r, &p[4]); - if (ch & 0x04) - be128_xor(r, r, &p[5]); - if (ch & 0x02) - be128_xor(r, r, &p[6]); - if (ch & 0x01) - be128_xor(r, r, &p[7]); + be128_xor(r, r, &p[ 0 + !(ch & 0x80)]); + be128_xor(r, r, &p[ 2 + !(ch & 0x40)]); + be128_xor(r, r, &p[ 4 + !(ch & 0x20)]); + be128_xor(r, r, &p[ 6 + !(ch & 0x10)]); + be128_xor(r, r, &p[ 8 + !(ch & 0x08)]); + be128_xor(r, r, &p[10 + !(ch & 0x04)]); + be128_xor(r, r, &p[12 + !(ch & 0x02)]); + be128_xor(r, r, &p[14 + !(ch & 0x01)]); if (++i >= 16) break; - gf128mul_x8_lle(r); + gf128mul_x8_lle_ti(r); /* use the time invariant version */ } } EXPORT_SYMBOL(gf128mul_lle); -- GitLab From 520af5da664a8edc4f4c1cd8e6e8488ecccdb7e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Nov 2022 20:22:59 +0100 Subject: [PATCH 0554/1423] crypto: lib/aesgcm - Provide minimal library implementation Implement a minimal library version of AES-GCM based on the existing library implementations of AES and multiplication in GF(2^128). Using these primitives, GCM can be implemented in a straight-forward manner. GCM has a couple of sharp edges, i.e., the amount of input data processed with the same initialization vector (IV) should be capped to protect the counter from 32-bit rollover (or carry), and the size of the authentication tag should be fixed for a given key. [0] The former concern is addressed trivially, given that the function call API uses 32-bit signed types for the input lengths. It is still up to the caller to avoid IV reuse in general, but this is not something we can police at the implementation level. As for the latter concern, let's make the authentication tag size part of the key schedule, and only permit it to be configured as part of the key expansion routine. Note that table based AES implementations are susceptible to known plaintext timing attacks on the encryption key. The AES library already attempts to mitigate this to some extent, but given that the counter mode encryption used by GCM operates exclusively on known plaintext by construction (the IV and therefore the initial counter value are known to an attacker), let's take some extra care to mitigate this, by calling the AES library with interrupts disabled. [0] https://nvlpubs.nist.gov/nistpubs/legacy/sp/nistspecialpublication800-38d.pdf Link: https://lore.kernel.org/all/c6fb9b25-a4b6-2e4a-2dd1-63adda055a49@amd.com/ Signed-off-by: Ard Biesheuvel Tested-by: Nikunj A Dadhania Signed-off-by: Herbert Xu --- include/crypto/gcm.h | 22 ++ lib/crypto/Kconfig | 6 + lib/crypto/Makefile | 3 + lib/crypto/aesgcm.c | 727 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 758 insertions(+) create mode 100644 lib/crypto/aesgcm.c diff --git a/include/crypto/gcm.h b/include/crypto/gcm.h index 9d7eff04f2244..fd9df607a8364 100644 --- a/include/crypto/gcm.h +++ b/include/crypto/gcm.h @@ -3,6 +3,9 @@ #include +#include +#include + #define GCM_AES_IV_SIZE 12 #define GCM_RFC4106_IV_SIZE 8 #define GCM_RFC4543_IV_SIZE 8 @@ -60,4 +63,23 @@ static inline int crypto_ipsec_check_assoclen(unsigned int assoclen) return 0; } + +struct aesgcm_ctx { + be128 ghash_key; + struct crypto_aes_ctx aes_ctx; + unsigned int authsize; +}; + +int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key, + unsigned int keysize, unsigned int authsize); + +void aesgcm_encrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, + int crypt_len, const u8 *assoc, int assoc_len, + const u8 iv[GCM_AES_IV_SIZE], u8 *authtag); + +bool __must_check aesgcm_decrypt(const struct aesgcm_ctx *ctx, u8 *dst, + const u8 *src, int crypt_len, const u8 *assoc, + int assoc_len, const u8 iv[GCM_AES_IV_SIZE], + const u8 *authtag); + #endif diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 6767d86959de8..45436bfc6dffe 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -8,6 +8,12 @@ config CRYPTO_LIB_UTILS config CRYPTO_LIB_AES tristate +config CRYPTO_LIB_AESGCM + tristate + select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL + select CRYPTO_LIB_UTILS + config CRYPTO_LIB_ARC4 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 7000eeb72286e..6ec2d4543d9ca 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -10,6 +10,9 @@ obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o libaes-y := aes.o +obj-$(CONFIG_CRYPTO_LIB_AESGCM) += libaesgcm.o +libaesgcm-y := aesgcm.o + obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o libarc4-y := arc4.o diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c new file mode 100644 index 0000000000000..c632d6e17af87 --- /dev/null +++ b/lib/crypto/aesgcm.c @@ -0,0 +1,727 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Minimal library implementation of GCM + * + * Copyright 2022 Google LLC + */ + +#include + +#include +#include +#include + +#include + +static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst, + const void *src) +{ + unsigned long flags; + + /* + * In AES-GCM, both the GHASH key derivation and the CTR mode + * encryption operate on known plaintext, making them susceptible to + * timing attacks on the encryption key. The AES library already + * mitigates this risk to some extent by pulling the entire S-box into + * the caches before doing any substitutions, but this strategy is more + * effective when running with interrupts disabled. + */ + local_irq_save(flags); + aes_encrypt(ctx, dst, src); + local_irq_restore(flags); +} + +/** + * aesgcm_expandkey - Expands the AES and GHASH keys for the AES-GCM key + * schedule + * + * @ctx: The data structure that will hold the AES-GCM key schedule + * @key: The AES encryption input key + * @keysize: The length in bytes of the input key + * @authsize: The size in bytes of the GCM authentication tag + * + * Returns: 0 on success, or -EINVAL if @keysize or @authsize contain values + * that are not permitted by the GCM specification. + */ +int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key, + unsigned int keysize, unsigned int authsize) +{ + u8 kin[AES_BLOCK_SIZE] = {}; + int ret; + + ret = crypto_gcm_check_authsize(authsize) ?: + aes_expandkey(&ctx->aes_ctx, key, keysize); + if (ret) + return ret; + + ctx->authsize = authsize; + aesgcm_encrypt_block(&ctx->aes_ctx, &ctx->ghash_key, kin); + + return 0; +} +EXPORT_SYMBOL(aesgcm_expandkey); + +static void aesgcm_ghash(be128 *ghash, const be128 *key, const void *src, + int len) +{ + while (len > 0) { + crypto_xor((u8 *)ghash, src, min(len, GHASH_BLOCK_SIZE)); + gf128mul_lle(ghash, key); + + src += GHASH_BLOCK_SIZE; + len -= GHASH_BLOCK_SIZE; + } +} + +static void aesgcm_mac(const struct aesgcm_ctx *ctx, const u8 *src, int src_len, + const u8 *assoc, int assoc_len, __be32 *ctr, u8 *authtag) +{ + be128 tail = { cpu_to_be64(assoc_len * 8), cpu_to_be64(src_len * 8) }; + u8 buf[AES_BLOCK_SIZE]; + be128 ghash = {}; + + aesgcm_ghash(&ghash, &ctx->ghash_key, assoc, assoc_len); + aesgcm_ghash(&ghash, &ctx->ghash_key, src, src_len); + aesgcm_ghash(&ghash, &ctx->ghash_key, &tail, sizeof(tail)); + + ctr[3] = cpu_to_be32(1); + aesgcm_encrypt_block(&ctx->aes_ctx, buf, ctr); + crypto_xor_cpy(authtag, buf, (u8 *)&ghash, ctx->authsize); + + memzero_explicit(&ghash, sizeof(ghash)); + memzero_explicit(buf, sizeof(buf)); +} + +static void aesgcm_crypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, + int len, __be32 *ctr) +{ + u8 buf[AES_BLOCK_SIZE]; + unsigned int n = 2; + + while (len > 0) { + /* + * The counter increment below must not result in overflow or + * carry into the next 32-bit word, as this could result in + * inadvertent IV reuse, which must be avoided at all cost for + * stream ciphers such as AES-CTR. Given the range of 'int + * len', this cannot happen, so no explicit test is necessary. + */ + ctr[3] = cpu_to_be32(n++); + aesgcm_encrypt_block(&ctx->aes_ctx, buf, ctr); + crypto_xor_cpy(dst, src, buf, min(len, AES_BLOCK_SIZE)); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + len -= AES_BLOCK_SIZE; + } + memzero_explicit(buf, sizeof(buf)); +} + +/** + * aesgcm_encrypt - Perform AES-GCM encryption on a block of data + * + * @ctx: The AES-GCM key schedule + * @dst: Pointer to the ciphertext output buffer + * @src: Pointer the plaintext (may equal @dst for encryption in place) + * @crypt_len: The size in bytes of the plaintext and ciphertext. + * @assoc: Pointer to the associated data, + * @assoc_len: The size in bytes of the associated data + * @iv: The initialization vector (IV) to use for this block of data + * (must be 12 bytes in size as per the GCM spec recommendation) + * @authtag: The address of the buffer in memory where the authentication + * tag should be stored. The buffer is assumed to have space for + * @ctx->authsize bytes. + */ +void aesgcm_encrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, + int crypt_len, const u8 *assoc, int assoc_len, + const u8 iv[GCM_AES_IV_SIZE], u8 *authtag) +{ + __be32 ctr[4]; + + memcpy(ctr, iv, GCM_AES_IV_SIZE); + + aesgcm_crypt(ctx, dst, src, crypt_len, ctr); + aesgcm_mac(ctx, dst, crypt_len, assoc, assoc_len, ctr, authtag); +} +EXPORT_SYMBOL(aesgcm_encrypt); + +/** + * aesgcm_decrypt - Perform AES-GCM decryption on a block of data + * + * @ctx: The AES-GCM key schedule + * @dst: Pointer to the plaintext output buffer + * @src: Pointer the ciphertext (may equal @dst for decryption in place) + * @crypt_len: The size in bytes of the plaintext and ciphertext. + * @assoc: Pointer to the associated data, + * @assoc_len: The size in bytes of the associated data + * @iv: The initialization vector (IV) to use for this block of data + * (must be 12 bytes in size as per the GCM spec recommendation) + * @authtag: The address of the buffer in memory where the authentication + * tag is stored. + * + * Returns: true on success, or false if the ciphertext failed authentication. + * On failure, no plaintext will be returned. + */ +bool __must_check aesgcm_decrypt(const struct aesgcm_ctx *ctx, u8 *dst, + const u8 *src, int crypt_len, const u8 *assoc, + int assoc_len, const u8 iv[GCM_AES_IV_SIZE], + const u8 *authtag) +{ + u8 tagbuf[AES_BLOCK_SIZE]; + __be32 ctr[4]; + + memcpy(ctr, iv, GCM_AES_IV_SIZE); + + aesgcm_mac(ctx, src, crypt_len, assoc, assoc_len, ctr, tagbuf); + if (crypto_memneq(authtag, tagbuf, ctx->authsize)) { + memzero_explicit(tagbuf, sizeof(tagbuf)); + return false; + } + aesgcm_crypt(ctx, dst, src, crypt_len, ctr); + return true; +} +EXPORT_SYMBOL(aesgcm_decrypt); + +MODULE_DESCRIPTION("Generic AES-GCM library"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL"); + +#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS + +/* + * Test code below. Vectors taken from crypto/testmgr.h + */ + +static const u8 __initconst ctext0[16] = + "\x58\xe2\xfc\xce\xfa\x7e\x30\x61" + "\x36\x7f\x1d\x57\xa4\xe7\x45\x5a"; + +static const u8 __initconst ptext1[16]; + +static const u8 __initconst ctext1[32] = + "\x03\x88\xda\xce\x60\xb6\xa3\x92" + "\xf3\x28\xc2\xb9\x71\xb2\xfe\x78" + "\xab\x6e\x47\xd4\x2c\xec\x13\xbd" + "\xf5\x3a\x67\xb2\x12\x57\xbd\xdf"; + +static const u8 __initconst ptext2[64] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; + +static const u8 __initconst ctext2[80] = + "\x42\x83\x1e\xc2\x21\x77\x74\x24" + "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" + "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" + "\x35\xc1\x7e\x23\x29\xac\xa1\x2e" + "\x21\xd5\x14\xb2\x54\x66\x93\x1c" + "\x7d\x8f\x6a\x5a\xac\x84\xaa\x05" + "\x1b\xa3\x0b\x39\x6a\x0a\xac\x97" + "\x3d\x58\xe0\x91\x47\x3f\x59\x85" + "\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6" + "\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4"; + +static const u8 __initconst ptext3[60] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39"; + +static const u8 __initconst ctext3[76] = + "\x42\x83\x1e\xc2\x21\x77\x74\x24" + "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" + "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" + "\x35\xc1\x7e\x23\x29\xac\xa1\x2e" + "\x21\xd5\x14\xb2\x54\x66\x93\x1c" + "\x7d\x8f\x6a\x5a\xac\x84\xaa\x05" + "\x1b\xa3\x0b\x39\x6a\x0a\xac\x97" + "\x3d\x58\xe0\x91" + "\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb" + "\x94\xfa\xe9\x5a\xe7\x12\x1a\x47"; + +static const u8 __initconst ctext4[16] = + "\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b" + "\xa0\x0e\xd1\xf3\x12\x57\x24\x35"; + +static const u8 __initconst ctext5[32] = + "\x98\xe7\x24\x7c\x07\xf0\xfe\x41" + "\x1c\x26\x7e\x43\x84\xb0\xf6\x00" + "\x2f\xf5\x8d\x80\x03\x39\x27\xab" + "\x8e\xf4\xd4\x58\x75\x14\xf0\xfb"; + +static const u8 __initconst ptext6[64] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; + +static const u8 __initconst ctext6[80] = + "\x39\x80\xca\x0b\x3c\x00\xe8\x41" + "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" + "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" + "\x62\x85\x93\xb4\x0c\xa1\xe1\x9c" + "\x7d\x77\x3d\x00\xc1\x44\xc5\x25" + "\xac\x61\x9d\x18\xc8\x4a\x3f\x47" + "\x18\xe2\x44\x8b\x2f\xe3\x24\xd9" + "\xcc\xda\x27\x10\xac\xad\xe2\x56" + "\x99\x24\xa7\xc8\x58\x73\x36\xbf" + "\xb1\x18\x02\x4d\xb8\x67\x4a\x14"; + +static const u8 __initconst ctext7[16] = + "\x53\x0f\x8a\xfb\xc7\x45\x36\xb9" + "\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b"; + +static const u8 __initconst ctext8[32] = + "\xce\xa7\x40\x3d\x4d\x60\x6b\x6e" + "\x07\x4e\xc5\xd3\xba\xf3\x9d\x18" + "\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0" + "\x26\x5b\x98\xb5\xd4\x8a\xb9\x19"; + +static const u8 __initconst ptext9[64] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; + +static const u8 __initconst ctext9[80] = + "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" + "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" + "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" + "\x75\x98\xa2\xbd\x25\x55\xd1\xaa" + "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d" + "\xa7\xb0\x8b\x10\x56\x82\x88\x38" + "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a" + "\xbc\xc9\xf6\x62\x89\x80\x15\xad" + "\xb0\x94\xda\xc5\xd9\x34\x71\xbd" + "\xec\x1a\x50\x22\x70\xe3\xcc\x6c"; + +static const u8 __initconst ptext10[60] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39"; + +static const u8 __initconst ctext10[76] = + "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" + "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" + "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" + "\x75\x98\xa2\xbd\x25\x55\xd1\xaa" + "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d" + "\xa7\xb0\x8b\x10\x56\x82\x88\x38" + "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a" + "\xbc\xc9\xf6\x62" + "\x76\xfc\x6e\xce\x0f\x4e\x17\x68" + "\xcd\xdf\x88\x53\xbb\x2d\x55\x1b"; + +static const u8 __initconst ptext11[60] = + "\xd9\x31\x32\x25\xf8\x84\x06\xe5" + "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda" + "\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53" + "\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" + "\xba\x63\x7b\x39"; + +static const u8 __initconst ctext11[76] = + "\x39\x80\xca\x0b\x3c\x00\xe8\x41" + "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" + "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" + "\x62\x85\x93\xb4\x0c\xa1\xe1\x9c" + "\x7d\x77\x3d\x00\xc1\x44\xc5\x25" + "\xac\x61\x9d\x18\xc8\x4a\x3f\x47" + "\x18\xe2\x44\x8b\x2f\xe3\x24\xd9" + "\xcc\xda\x27\x10" + "\x25\x19\x49\x8e\x80\xf1\x47\x8f" + "\x37\xba\x55\xbd\x6d\x27\x61\x8c"; + +static const u8 __initconst ptext12[719] = + "\x42\xc1\xcc\x08\x48\x6f\x41\x3f" + "\x2f\x11\x66\x8b\x2a\x16\xf0\xe0" + "\x58\x83\xf0\xc3\x70\x14\xc0\x5b" + "\x3f\xec\x1d\x25\x3c\x51\xd2\x03" + "\xcf\x59\x74\x1f\xb2\x85\xb4\x07" + "\xc6\x6a\x63\x39\x8a\x5b\xde\xcb" + "\xaf\x08\x44\xbd\x6f\x91\x15\xe1" + "\xf5\x7a\x6e\x18\xbd\xdd\x61\x50" + "\x59\xa9\x97\xab\xbb\x0e\x74\x5c" + "\x00\xa4\x43\x54\x04\x54\x9b\x3b" + "\x77\xec\xfd\x5c\xa6\xe8\x7b\x08" + "\xae\xe6\x10\x3f\x32\x65\xd1\xfc" + "\xa4\x1d\x2c\x31\xfb\x33\x7a\xb3" + "\x35\x23\xf4\x20\x41\xd4\xad\x82" + "\x8b\xa4\xad\x96\x1c\x20\x53\xbe" + "\x0e\xa6\xf4\xdc\x78\x49\x3e\x72" + "\xb1\xa9\xb5\x83\xcb\x08\x54\xb7" + "\xad\x49\x3a\xae\x98\xce\xa6\x66" + "\x10\x30\x90\x8c\x55\x83\xd7\x7c" + "\x8b\xe6\x53\xde\xd2\x6e\x18\x21" + "\x01\x52\xd1\x9f\x9d\xbb\x9c\x73" + "\x57\xcc\x89\x09\x75\x9b\x78\x70" + "\xed\x26\x97\x4d\xb4\xe4\x0c\xa5" + "\xfa\x70\x04\x70\xc6\x96\x1c\x7d" + "\x54\x41\x77\xa8\xe3\xb0\x7e\x96" + "\x82\xd9\xec\xa2\x87\x68\x55\xf9" + "\x8f\x9e\x73\x43\x47\x6a\x08\x36" + "\x93\x67\xa8\x2d\xde\xac\x41\xa9" + "\x5c\x4d\x73\x97\x0f\x70\x68\xfa" + "\x56\x4d\x00\xc2\x3b\x1f\xc8\xb9" + "\x78\x1f\x51\x07\xe3\x9a\x13\x4e" + "\xed\x2b\x2e\xa3\xf7\x44\xb2\xe7" + "\xab\x19\x37\xd9\xba\x76\x5e\xd2" + "\xf2\x53\x15\x17\x4c\x6b\x16\x9f" + "\x02\x66\x49\xca\x7c\x91\x05\xf2" + "\x45\x36\x1e\xf5\x77\xad\x1f\x46" + "\xa8\x13\xfb\x63\xb6\x08\x99\x63" + "\x82\xa2\xed\xb3\xac\xdf\x43\x19" + "\x45\xea\x78\x73\xd9\xb7\x39\x11" + "\xa3\x13\x7c\xf8\x3f\xf7\xad\x81" + "\x48\x2f\xa9\x5c\x5f\xa0\xf0\x79" + "\xa4\x47\x7d\x80\x20\x26\xfd\x63" + "\x0a\xc7\x7e\x6d\x75\x47\xff\x76" + "\x66\x2e\x8a\x6c\x81\x35\xaf\x0b" + "\x2e\x6a\x49\x60\xc1\x10\xe1\xe1" + "\x54\x03\xa4\x09\x0c\x37\x7a\x15" + "\x23\x27\x5b\x8b\x4b\xa5\x64\x97" + "\xae\x4a\x50\x73\x1f\x66\x1c\x5c" + "\x03\x25\x3c\x8d\x48\x58\x71\x34" + "\x0e\xec\x4e\x55\x1a\x03\x6a\xe5" + "\xb6\x19\x2b\x84\x2a\x20\xd1\xea" + "\x80\x6f\x96\x0e\x05\x62\xc7\x78" + "\x87\x79\x60\x38\x46\xb4\x25\x57" + "\x6e\x16\x63\xf8\xad\x6e\xd7\x42" + "\x69\xe1\x88\xef\x6e\xd5\xb4\x9a" + "\x3c\x78\x6c\x3b\xe5\xa0\x1d\x22" + "\x86\x5c\x74\x3a\xeb\x24\x26\xc7" + "\x09\xfc\x91\x96\x47\x87\x4f\x1a" + "\xd6\x6b\x2c\x18\x47\xc0\xb8\x24" + "\xa8\x5a\x4a\x9e\xcb\x03\xe7\x2a" + "\x09\xe6\x4d\x9c\x6d\x86\x60\xf5" + "\x2f\x48\x69\x37\x9f\xf2\xd2\xcb" + "\x0e\x5a\xdd\x6e\x8a\xfb\x6a\xfe" + "\x0b\x63\xde\x87\x42\x79\x8a\x68" + "\x51\x28\x9b\x7a\xeb\xaf\xb8\x2f" + "\x9d\xd1\xc7\x45\x90\x08\xc9\x83" + "\xe9\x83\x84\xcb\x28\x69\x09\x69" + "\xce\x99\x46\x00\x54\xcb\xd8\x38" + "\xf9\x53\x4a\xbf\x31\xce\x57\x15" + "\x33\xfa\x96\x04\x33\x42\xe3\xc0" + "\xb7\x54\x4a\x65\x7a\x7c\x02\xe6" + "\x19\x95\xd0\x0e\x82\x07\x63\xf9" + "\xe1\x2b\x2a\xfc\x55\x92\x52\xc9" + "\xb5\x9f\x23\x28\x60\xe7\x20\x51" + "\x10\xd3\xed\x6d\x9b\xab\xb8\xe2" + "\x5d\x9a\x34\xb3\xbe\x9c\x64\xcb" + "\x78\xc6\x91\x22\x40\x91\x80\xbe" + "\xd7\x78\x5c\x0e\x0a\xdc\x08\xe9" + "\x67\x10\xa4\x83\x98\x79\x23\xe7" + "\x92\xda\xa9\x22\x16\xb1\xe7\x78" + "\xa3\x1c\x6c\x8f\x35\x7c\x4d\x37" + "\x2f\x6e\x0b\x50\x5c\x34\xb9\xf9" + "\xe6\x3d\x91\x0d\x32\x95\xaa\x3d" + "\x48\x11\x06\xbb\x2d\xf2\x63\x88" + "\x3f\x73\x09\xe2\x45\x56\x31\x51" + "\xfa\x5e\x4e\x62\xf7\x90\xf9\xa9" + "\x7d\x7b\x1b\xb1\xc8\x26\x6e\x66" + "\xf6\x90\x9a\x7f\xf2\x57\xcc\x23" + "\x59\xfa\xfa\xaa\x44\x04\x01\xa7" + "\xa4\x78\xdb\x74\x3d\x8b\xb5"; + +static const u8 __initconst ctext12[735] = + "\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20" + "\xbb\xb1\x12\x7f\x41\xea\xb3\xc0" + "\xa2\xb4\x37\x19\x11\x58\xb6\x0b" + "\x4c\x1d\x38\x05\x54\xd1\x16\x73" + "\x8e\x1c\x20\x90\xa2\x9a\xb7\x74" + "\x47\xe6\xd8\xfc\x18\x3a\xb4\xea" + "\xd5\x16\x5a\x2c\x53\x01\x46\xb3" + "\x18\x33\x74\x6c\x50\xf2\xe8\xc0" + "\x73\xda\x60\x22\xeb\xe3\xe5\x9b" + "\x20\x93\x6c\x4b\x37\x99\xb8\x23" + "\x3b\x4e\xac\xe8\x5b\xe8\x0f\xb7" + "\xc3\x8f\xfb\x4a\x37\xd9\x39\x95" + "\x34\xf1\xdb\x8f\x71\xd9\xc7\x0b" + "\x02\xf1\x63\xfc\x9b\xfc\xc5\xab" + "\xb9\x14\x13\x21\xdf\xce\xaa\x88" + "\x44\x30\x1e\xce\x26\x01\x92\xf8" + "\x9f\x00\x4b\x0c\x4b\xf7\x5f\xe0" + "\x89\xca\x94\x66\x11\x21\x97\xca" + "\x3e\x83\x74\x2d\xdb\x4d\x11\xeb" + "\x97\xc2\x14\xff\x9e\x1e\xa0\x6b" + "\x08\xb4\x31\x2b\x85\xc6\x85\x6c" + "\x90\xec\x39\xc0\xec\xb3\xb5\x4e" + "\xf3\x9c\xe7\x83\x3a\x77\x0a\xf4" + "\x56\xfe\xce\x18\x33\x6d\x0b\x2d" + "\x33\xda\xc8\x05\x5c\xb4\x09\x2a" + "\xde\x6b\x52\x98\x01\xef\x36\x3d" + "\xbd\xf9\x8f\xa8\x3e\xaa\xcd\xd1" + "\x01\x2d\x42\x49\xc3\xb6\x84\xbb" + "\x48\x96\xe0\x90\x93\x6c\x48\x64" + "\xd4\xfa\x7f\x93\x2c\xa6\x21\xc8" + "\x7a\x23\x7b\xaa\x20\x56\x12\xae" + "\x16\x9d\x94\x0f\x54\xa1\xec\xca" + "\x51\x4e\xf2\x39\xf4\xf8\x5f\x04" + "\x5a\x0d\xbf\xf5\x83\xa1\x15\xe1" + "\xf5\x3c\xd8\x62\xa3\xed\x47\x89" + "\x85\x4c\xe5\xdb\xac\x9e\x17\x1d" + "\x0c\x09\xe3\x3e\x39\x5b\x4d\x74" + "\x0e\xf5\x34\xee\x70\x11\x4c\xfd" + "\xdb\x34\xb1\xb5\x10\x3f\x73\xb7" + "\xf5\xfa\xed\xb0\x1f\xa5\xcd\x3c" + "\x8d\x35\x83\xd4\x11\x44\x6e\x6c" + "\x5b\xe0\x0e\x69\xa5\x39\xe5\xbb" + "\xa9\x57\x24\x37\xe6\x1f\xdd\xcf" + "\x16\x2a\x13\xf9\x6a\x2d\x90\xa0" + "\x03\x60\x7a\xed\x69\xd5\x00\x8b" + "\x7e\x4f\xcb\xb9\xfa\x91\xb9\x37" + "\xc1\x26\xce\x90\x97\x22\x64\x64" + "\xc1\x72\x43\x1b\xf6\xac\xc1\x54" + "\x8a\x10\x9c\xdd\x8d\xd5\x8e\xb2" + "\xe4\x85\xda\xe0\x20\x5f\xf4\xb4" + "\x15\xb5\xa0\x8d\x12\x74\x49\x23" + "\x3a\xdf\x4a\xd3\xf0\x3b\x89\xeb" + "\xf8\xcc\x62\x7b\xfb\x93\x07\x41" + "\x61\x26\x94\x58\x70\xa6\x3c\xe4" + "\xff\x58\xc4\x13\x3d\xcb\x36\x6b" + "\x32\xe5\xb2\x6d\x03\x74\x6f\x76" + "\x93\x77\xde\x48\xc4\xfa\x30\x4a" + "\xda\x49\x80\x77\x0f\x1c\xbe\x11" + "\xc8\x48\xb1\xe5\xbb\xf2\x8a\xe1" + "\x96\x2f\x9f\xd1\x8e\x8a\x5c\xe2" + "\xf7\xd7\xd8\x54\xf3\x3f\xc4\x91" + "\xb8\xfb\x86\xdc\x46\x24\x91\x60" + "\x6c\x2f\xc9\x41\x37\x51\x49\x54" + "\x09\x81\x21\xf3\x03\x9f\x2b\xe3" + "\x1f\x39\x63\xaf\xf4\xd7\x53\x60" + "\xa7\xc7\x54\xf9\xee\xb1\xb1\x7d" + "\x75\x54\x65\x93\xfe\xb1\x68\x6b" + "\x57\x02\xf9\xbb\x0e\xf9\xf8\xbf" + "\x01\x12\x27\xb4\xfe\xe4\x79\x7a" + "\x40\x5b\x51\x4b\xdf\x38\xec\xb1" + "\x6a\x56\xff\x35\x4d\x42\x33\xaa" + "\x6f\x1b\xe4\xdc\xe0\xdb\x85\x35" + "\x62\x10\xd4\xec\xeb\xc5\x7e\x45" + "\x1c\x6f\x17\xca\x3b\x8e\x2d\x66" + "\x4f\x4b\x36\x56\xcd\x1b\x59\xaa" + "\xd2\x9b\x17\xb9\x58\xdf\x7b\x64" + "\x8a\xff\x3b\x9c\xa6\xb5\x48\x9e" + "\xaa\xe2\x5d\x09\x71\x32\x5f\xb6" + "\x29\xbe\xe7\xc7\x52\x7e\x91\x82" + "\x6b\x6d\x33\xe1\x34\x06\x36\x21" + "\x5e\xbe\x1e\x2f\x3e\xc1\xfb\xea" + "\x49\x2c\xb5\xca\xf7\xb0\x37\xea" + "\x1f\xed\x10\x04\xd9\x48\x0d\x1a" + "\x1c\xfb\xe7\x84\x0e\x83\x53\x74" + "\xc7\x65\xe2\x5c\xe5\xba\x73\x4c" + "\x0e\xe1\xb5\x11\x45\x61\x43\x46" + "\xaa\x25\x8f\xbd\x85\x08\xfa\x4c" + "\x15\xc1\xc0\xd8\xf5\xdc\x16\xbb" + "\x7b\x1d\xe3\x87\x57\xa7\x2a\x1d" + "\x38\x58\x9e\x8a\x43\xdc\x57" + "\xd1\x81\x7d\x2b\xe9\xff\x99\x3a" + "\x4b\x24\x52\x58\x55\xe1\x49\x14"; + +static struct { + const u8 *ptext; + const u8 *ctext; + + u8 key[AES_MAX_KEY_SIZE]; + u8 iv[GCM_AES_IV_SIZE]; + u8 assoc[20]; + + int klen; + int clen; + int plen; + int alen; +} const aesgcm_tv[] __initconst = { + { /* From McGrew & Viega - http://citeseer.ist.psu.edu/656989.html */ + .klen = 16, + .ctext = ctext0, + .clen = sizeof(ctext0), + }, { + .klen = 16, + .ptext = ptext1, + .plen = sizeof(ptext1), + .ctext = ctext1, + .clen = sizeof(ctext1), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 16, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext2, + .plen = sizeof(ptext2), + .ctext = ctext2, + .clen = sizeof(ctext2), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 16, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext3, + .plen = sizeof(ptext3), + .assoc = "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", + .alen = 20, + .ctext = ctext3, + .clen = sizeof(ctext3), + }, { + .klen = 24, + .ctext = ctext4, + .clen = sizeof(ctext4), + }, { + .klen = 24, + .ptext = ptext1, + .plen = sizeof(ptext1), + .ctext = ctext5, + .clen = sizeof(ctext5), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c", + .klen = 24, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext6, + .plen = sizeof(ptext6), + .ctext = ctext6, + .clen = sizeof(ctext6), + }, { + .klen = 32, + .ctext = ctext7, + .clen = sizeof(ctext7), + }, { + .klen = 32, + .ptext = ptext1, + .plen = sizeof(ptext1), + .ctext = ctext8, + .clen = sizeof(ctext8), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 32, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext9, + .plen = sizeof(ptext9), + .ctext = ctext9, + .clen = sizeof(ctext9), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08", + .klen = 32, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext10, + .plen = sizeof(ptext10), + .assoc = "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", + .alen = 20, + .ctext = ctext10, + .clen = sizeof(ctext10), + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c", + .klen = 24, + .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" + "\xde\xca\xf8\x88", + .ptext = ptext11, + .plen = sizeof(ptext11), + .assoc = "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", + .alen = 20, + .ctext = ctext11, + .clen = sizeof(ctext11), + }, { + .key = "\x62\x35\xf8\x95\xfc\xa5\xeb\xf6" + "\x0e\x92\x12\x04\xd3\xa1\x3f\x2e" + "\x8b\x32\xcf\xe7\x44\xed\x13\x59" + "\x04\x38\x77\xb0\xb9\xad\xb4\x38", + .klen = 32, + .iv = "\x00\xff\xff\xff\xff\x00\x00\xff" + "\xff\xff\x00\xff", + .ptext = ptext12, + .plen = sizeof(ptext12), + .ctext = ctext12, + .clen = sizeof(ctext12), + } +}; + +static int __init libaesgcm_init(void) +{ + for (int i = 0; i < ARRAY_SIZE(aesgcm_tv); i++) { + u8 tagbuf[AES_BLOCK_SIZE]; + int plen = aesgcm_tv[i].plen; + struct aesgcm_ctx ctx; + u8 buf[sizeof(ptext12)]; + + if (aesgcm_expandkey(&ctx, aesgcm_tv[i].key, aesgcm_tv[i].klen, + aesgcm_tv[i].clen - plen)) { + pr_err("aesgcm_expandkey() failed on vector %d\n", i); + return -ENODEV; + } + + if (!aesgcm_decrypt(&ctx, buf, aesgcm_tv[i].ctext, plen, + aesgcm_tv[i].assoc, aesgcm_tv[i].alen, + aesgcm_tv[i].iv, aesgcm_tv[i].ctext + plen) + || memcmp(buf, aesgcm_tv[i].ptext, plen)) { + pr_err("aesgcm_decrypt() #1 failed on vector %d\n", i); + return -ENODEV; + } + + /* encrypt in place */ + aesgcm_encrypt(&ctx, buf, buf, plen, aesgcm_tv[i].assoc, + aesgcm_tv[i].alen, aesgcm_tv[i].iv, tagbuf); + if (memcmp(buf, aesgcm_tv[i].ctext, plen)) { + pr_err("aesgcm_encrypt() failed on vector %d\n", i); + return -ENODEV; + } + + /* decrypt in place */ + if (!aesgcm_decrypt(&ctx, buf, buf, plen, aesgcm_tv[i].assoc, + aesgcm_tv[i].alen, aesgcm_tv[i].iv, tagbuf) + || memcmp(buf, aesgcm_tv[i].ptext, plen)) { + pr_err("aesgcm_decrypt() #2 failed on vector %d\n", i); + return -ENODEV; + } + } + return 0; +} +module_init(libaesgcm_init); + +static void __exit libaesgcm_exit(void) +{ +} +module_exit(libaesgcm_exit); +#endif -- GitLab From fb11cddfe24caad33667b5c9dd859562b2be6c75 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 4 Nov 2022 15:45:27 +0800 Subject: [PATCH 0555/1423] crypto: rockchip - Remove surplus dev_err() when using platform_get_irq() There is no need to call the dev_err() function directly to print a custom message when handling an error from either the platform_get_irq() or platform_get_irq_byname() functions as both are going to display an appropriate error message in case of a failure. ./drivers/crypto/rockchip/rk3288_crypto.c:351:2-9: line 351 is redundant because platform_get_irq() already prints an error Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2677 Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Corentin Labbe Signed-off-by: Herbert Xu --- drivers/crypto/rockchip/rk3288_crypto.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c index 6217e73ba4c41..9f6ba770a90a1 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.c +++ b/drivers/crypto/rockchip/rk3288_crypto.c @@ -348,7 +348,6 @@ static int rk_crypto_probe(struct platform_device *pdev) crypto_info->irq = platform_get_irq(pdev, 0); if (crypto_info->irq < 0) { - dev_err(&pdev->dev, "control Interrupt is not available.\n"); err = crypto_info->irq; goto err_crypto; } -- GitLab From 557ffd5a4726f8b6f0dd1d4b632ae02c1c063233 Mon Sep 17 00:00:00 2001 From: Shashank Gupta Date: Fri, 4 Nov 2022 13:21:07 -0400 Subject: [PATCH 0556/1423] crypto: qat - remove ADF_STATUS_PF_RUNNING flag from probe The ADF_STATUS_PF_RUNNING bit is set after the successful initialization of the communication between VF to PF in adf_vf2pf_notify_init(). So, it is not required to be set after the execution of the function adf_dev_init(). Signed-off-by: Shashank Gupta Reviewed-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_c3xxxvf/adf_drv.c | 2 -- drivers/crypto/qat/qat_c62xvf/adf_drv.c | 2 -- drivers/crypto/qat/qat_dh895xccvf/adf_drv.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c index fa18d8009f533..cf4ef83e186f8 100644 --- a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c @@ -177,8 +177,6 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto out_err_dev_shutdown; - set_bit(ADF_STATUS_PF_RUNNING, &accel_dev->status); - ret = adf_dev_start(accel_dev); if (ret) goto out_err_dev_stop; diff --git a/drivers/crypto/qat/qat_c62xvf/adf_drv.c b/drivers/crypto/qat/qat_c62xvf/adf_drv.c index 686ec752d0e9e..0e642c94b9293 100644 --- a/drivers/crypto/qat/qat_c62xvf/adf_drv.c +++ b/drivers/crypto/qat/qat_c62xvf/adf_drv.c @@ -177,8 +177,6 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto out_err_dev_shutdown; - set_bit(ADF_STATUS_PF_RUNNING, &accel_dev->status); - ret = adf_dev_start(accel_dev); if (ret) goto out_err_dev_stop; diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c index 18756b2e1c912..c1485e702b3eb 100644 --- a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c @@ -177,8 +177,6 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto out_err_dev_shutdown; - set_bit(ADF_STATUS_PF_RUNNING, &accel_dev->status); - ret = adf_dev_start(accel_dev); if (ret) goto out_err_dev_stop; -- GitLab From 198acab1772f22f2e91f68a2fc1331e91dad780a Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Tue, 11 Oct 2022 14:42:06 -0400 Subject: [PATCH 0557/1423] PCI: brcmstb: Enable Multi-MSI We always wanted to enable Multi-MSI but didn't have a test device until recently. In addition, there are some devices out there that will ask for multiple MSI but refuse to work if they are only granted one. Link: https://lore.kernel.org/r/20221011184211.18128-2-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 521acd632f1ac..a45ce7d61847a 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -445,7 +445,8 @@ static struct irq_chip brcm_msi_irq_chip = { static struct msi_domain_info brcm_msi_domain_info = { /* Multi MSI is supported by the controller, but not by this driver */ - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI), .chip = &brcm_msi_irq_chip, }; @@ -505,21 +506,23 @@ static struct irq_chip brcm_msi_bottom_irq_chip = { .irq_ack = brcm_msi_ack_irq, }; -static int brcm_msi_alloc(struct brcm_msi *msi) +static int brcm_msi_alloc(struct brcm_msi *msi, unsigned int nr_irqs) { int hwirq; mutex_lock(&msi->lock); - hwirq = bitmap_find_free_region(msi->used, msi->nr, 0); + hwirq = bitmap_find_free_region(msi->used, msi->nr, + order_base_2(nr_irqs)); mutex_unlock(&msi->lock); return hwirq; } -static void brcm_msi_free(struct brcm_msi *msi, unsigned long hwirq) +static void brcm_msi_free(struct brcm_msi *msi, unsigned long hwirq, + unsigned int nr_irqs) { mutex_lock(&msi->lock); - bitmap_release_region(msi->used, hwirq, 0); + bitmap_release_region(msi->used, hwirq, order_base_2(nr_irqs)); mutex_unlock(&msi->lock); } @@ -527,16 +530,17 @@ static int brcm_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *args) { struct brcm_msi *msi = domain->host_data; - int hwirq; + int hwirq, i; - hwirq = brcm_msi_alloc(msi); + hwirq = brcm_msi_alloc(msi, nr_irqs); if (hwirq < 0) return hwirq; - irq_domain_set_info(domain, virq, (irq_hw_number_t)hwirq, - &brcm_msi_bottom_irq_chip, domain->host_data, - handle_edge_irq, NULL, NULL); + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, + &brcm_msi_bottom_irq_chip, domain->host_data, + handle_edge_irq, NULL, NULL); return 0; } @@ -546,7 +550,7 @@ static void brcm_irq_domain_free(struct irq_domain *domain, struct irq_data *d = irq_domain_get_irq_data(domain, virq); struct brcm_msi *msi = irq_data_get_irq_chip_data(d); - brcm_msi_free(msi, d->hwirq); + brcm_msi_free(msi, d->hwirq, nr_irqs); } static const struct irq_domain_ops msi_domain_ops = { -- GitLab From 3ae140ad827b359bc4fa7c7985691c4c1e3ca8f4 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Tue, 11 Oct 2022 14:42:07 -0400 Subject: [PATCH 0558/1423] PCI: brcmstb: Wait for 100ms following PERST# deassert Be prudent and give some time for power and clocks to become stable. As described in the PCIe CEM specification sections 2.2 and 2.2.1; as well as PCIe r5.0, 6.6.1. Link: https://lore.kernel.org/r/20221011184211.18128-3-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index a45ce7d61847a..39b545713ba02 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -1037,8 +1037,15 @@ static int brcm_pcie_start_link(struct brcm_pcie *pcie) pcie->perst_set(pcie, 0); /* - * Give the RC/EP time to wake up, before trying to configure RC. - * Intermittently check status for link-up, up to a total of 100ms. + * Wait for 100ms after PERST# deassertion; see PCIe CEM specification + * sections 2.2, PCIe r5.0, 6.6.1. + */ + msleep(100); + + /* + * Give the RC/EP even more time to wake up, before trying to + * configure RC. Intermittently check status for link-up, up to a + * total of 100ms. */ for (i = 0; i < 100 && !brcm_pcie_link_up(pcie); i += 5) msleep(5); -- GitLab From ca5dcc76314d1fa6d7307fd3b95039b08d2f2b97 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Tue, 11 Oct 2022 14:42:08 -0400 Subject: [PATCH 0559/1423] PCI: brcmstb: Replace status loops with read_poll_timeout_atomic() It would be nice to replace the PCIe link-up loop as well but there are too many uses of this that do not poll (and the read_poll_timeout uses "timeout==0" to loop forever). Link: https://lore.kernel.org/r/20221011184211.18128-4-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 39b545713ba02..c7210cec1f582 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -302,42 +303,34 @@ static u32 brcm_pcie_mdio_form_pkt(int port, int regad, int cmd) /* negative return value indicates error */ static int brcm_pcie_mdio_read(void __iomem *base, u8 port, u8 regad, u32 *val) { - int tries; u32 data; + int err; writel(brcm_pcie_mdio_form_pkt(port, regad, MDIO_CMD_READ), base + PCIE_RC_DL_MDIO_ADDR); readl(base + PCIE_RC_DL_MDIO_ADDR); - - data = readl(base + PCIE_RC_DL_MDIO_RD_DATA); - for (tries = 0; !MDIO_RD_DONE(data) && tries < 10; tries++) { - udelay(10); - data = readl(base + PCIE_RC_DL_MDIO_RD_DATA); - } - + err = readl_poll_timeout_atomic(base + PCIE_RC_DL_MDIO_RD_DATA, data, + MDIO_RD_DONE(data), 10, 100); *val = FIELD_GET(MDIO_DATA_MASK, data); - return MDIO_RD_DONE(data) ? 0 : -EIO; + + return err; } /* negative return value indicates error */ static int brcm_pcie_mdio_write(void __iomem *base, u8 port, u8 regad, u16 wrdata) { - int tries; u32 data; + int err; writel(brcm_pcie_mdio_form_pkt(port, regad, MDIO_CMD_WRITE), base + PCIE_RC_DL_MDIO_ADDR); readl(base + PCIE_RC_DL_MDIO_ADDR); writel(MDIO_DATA_DONE_MASK | wrdata, base + PCIE_RC_DL_MDIO_WR_DATA); - data = readl(base + PCIE_RC_DL_MDIO_WR_DATA); - for (tries = 0; !MDIO_WT_DONE(data) && tries < 10; tries++) { - udelay(10); - data = readl(base + PCIE_RC_DL_MDIO_WR_DATA); - } - - return MDIO_WT_DONE(data) ? 0 : -EIO; + err = readw_poll_timeout_atomic(base + PCIE_RC_DL_MDIO_WR_DATA, data, + MDIO_WT_DONE(data), 10, 100); + return err; } /* -- GitLab From 137b57413f569d558c1054e2a181313574eb9a87 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Tue, 11 Oct 2022 14:42:09 -0400 Subject: [PATCH 0560/1423] PCI: brcmstb: Drop needless 'inline' annotations A number of inline functions are called rarely and/or are not time-critical. Take out the "inline" and let the compiler do its work. Link: https://lore.kernel.org/r/20221011184211.18128-5-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Reviewed-by: Bjorn Helgaas Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index c7210cec1f582..e3045f1eadbc5 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -723,7 +723,7 @@ static void __iomem *brcm7425_pcie_map_bus(struct pci_bus *bus, return base + DATA_ADDR(pcie); } -static inline void brcm_pcie_bridge_sw_init_set_generic(struct brcm_pcie *pcie, u32 val) +static void brcm_pcie_bridge_sw_init_set_generic(struct brcm_pcie *pcie, u32 val) { u32 tmp, mask = RGR1_SW_INIT_1_INIT_GENERIC_MASK; u32 shift = RGR1_SW_INIT_1_INIT_GENERIC_SHIFT; @@ -733,7 +733,7 @@ static inline void brcm_pcie_bridge_sw_init_set_generic(struct brcm_pcie *pcie, writel(tmp, pcie->base + PCIE_RGR1_SW_INIT_1(pcie)); } -static inline void brcm_pcie_bridge_sw_init_set_7278(struct brcm_pcie *pcie, u32 val) +static void brcm_pcie_bridge_sw_init_set_7278(struct brcm_pcie *pcie, u32 val) { u32 tmp, mask = RGR1_SW_INIT_1_INIT_7278_MASK; u32 shift = RGR1_SW_INIT_1_INIT_7278_SHIFT; @@ -743,7 +743,7 @@ static inline void brcm_pcie_bridge_sw_init_set_7278(struct brcm_pcie *pcie, u32 writel(tmp, pcie->base + PCIE_RGR1_SW_INIT_1(pcie)); } -static inline void brcm_pcie_perst_set_4908(struct brcm_pcie *pcie, u32 val) +static void brcm_pcie_perst_set_4908(struct brcm_pcie *pcie, u32 val) { if (WARN_ONCE(!pcie->perst_reset, "missing PERST# reset controller\n")) return; @@ -754,7 +754,7 @@ static inline void brcm_pcie_perst_set_4908(struct brcm_pcie *pcie, u32 val) reset_control_deassert(pcie->perst_reset); } -static inline void brcm_pcie_perst_set_7278(struct brcm_pcie *pcie, u32 val) +static void brcm_pcie_perst_set_7278(struct brcm_pcie *pcie, u32 val) { u32 tmp; @@ -764,7 +764,7 @@ static inline void brcm_pcie_perst_set_7278(struct brcm_pcie *pcie, u32 val) writel(tmp, pcie->base + PCIE_MISC_PCIE_CTRL); } -static inline void brcm_pcie_perst_set_generic(struct brcm_pcie *pcie, u32 val) +static void brcm_pcie_perst_set_generic(struct brcm_pcie *pcie, u32 val) { u32 tmp; @@ -773,7 +773,7 @@ static inline void brcm_pcie_perst_set_generic(struct brcm_pcie *pcie, u32 val) writel(tmp, pcie->base + PCIE_RGR1_SW_INIT_1(pcie)); } -static inline int brcm_pcie_get_rc_bar2_size_and_offset(struct brcm_pcie *pcie, +static int brcm_pcie_get_rc_bar2_size_and_offset(struct brcm_pcie *pcie, u64 *rc_bar2_size, u64 *rc_bar2_offset) { -- GitLab From 602fb860945fd6dce7989fcd3727d5fe4282f785 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Tue, 11 Oct 2022 14:42:10 -0400 Subject: [PATCH 0561/1423] PCI: brcmstb: Set RCB_{MPS,64B}_MODE bits Set RCB_MPS mode bit so that data for PCIe read requests up to the size of the Maximum Payload Size (MPS) are returned in one completion, and data for PCIe read requests greater than the MPS are split at the specified Read Completion Boundary setting. Set RCB_64B so that the Read Compeletion Boundary is 64B. Link: https://lore.kernel.org/r/20221011184211.18128-6-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Lorenzo Pieralisi Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index e3045f1eadbc5..edf283e2b5dd0 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -53,6 +53,8 @@ #define PCIE_RC_DL_MDIO_RD_DATA 0x1108 #define PCIE_MISC_MISC_CTRL 0x4008 +#define PCIE_MISC_MISC_CTRL_PCIE_RCB_64B_MODE_MASK 0x80 +#define PCIE_MISC_MISC_CTRL_PCIE_RCB_MPS_MODE_MASK 0x400 #define PCIE_MISC_MISC_CTRL_SCB_ACCESS_EN_MASK 0x1000 #define PCIE_MISC_MISC_CTRL_CFG_READ_UR_MODE_MASK 0x2000 #define PCIE_MISC_MISC_CTRL_MAX_BURST_SIZE_MASK 0x300000 @@ -900,11 +902,16 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) else burst = 0x2; /* 512 bytes */ - /* Set SCB_MAX_BURST_SIZE, CFG_READ_UR_MODE, SCB_ACCESS_EN */ + /* + * Set SCB_MAX_BURST_SIZE, CFG_READ_UR_MODE, SCB_ACCESS_EN, + * RCB_MPS_MODE, RCB_64B_MODE + */ tmp = readl(base + PCIE_MISC_MISC_CTRL); u32p_replace_bits(&tmp, 1, PCIE_MISC_MISC_CTRL_SCB_ACCESS_EN_MASK); u32p_replace_bits(&tmp, 1, PCIE_MISC_MISC_CTRL_CFG_READ_UR_MODE_MASK); u32p_replace_bits(&tmp, burst, PCIE_MISC_MISC_CTRL_MAX_BURST_SIZE_MASK); + u32p_replace_bits(&tmp, 1, PCIE_MISC_MISC_CTRL_PCIE_RCB_MPS_MODE_MASK); + u32p_replace_bits(&tmp, 1, PCIE_MISC_MISC_CTRL_PCIE_RCB_64B_MODE_MASK); writel(tmp, base + PCIE_MISC_MISC_CTRL); ret = brcm_pcie_get_rc_bar2_size_and_offset(pcie, &rc_bar2_size, -- GitLab From d899aa668498c07ff217b666ae9712990306e682 Mon Sep 17 00:00:00 2001 From: Nirmal Patel Date: Wed, 9 Nov 2022 07:26:52 -0700 Subject: [PATCH 0562/1423] PCI: vmd: Disable MSI remapping after suspend MSI remapping is disabled by VMD driver for Intel's Icelake and newer systems in order to improve performance by setting VMCONFIG_MSI_REMAP. By design VMCONFIG_MSI_REMAP register is cleared by firmware during boot. The same register gets cleared when system is put in S3 power state. VMD driver needs to set this register again in order to avoid interrupt issues with devices behind VMD if MSI remapping was disabled before. Link: https://lore.kernel.org/r/20221109142652.450998-1-nirmal.patel@linux.intel.com Fixes: ee81ee84f873 ("PCI: vmd: Disable MSI-X remapping when possible") Signed-off-by: Nirmal Patel Signed-off-by: Lorenzo Pieralisi Reviewed-by: Francisco Munoz --- drivers/pci/controller/vmd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index e06e9f4fc50f7..98e0746e681c9 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -980,6 +980,11 @@ static int vmd_resume(struct device *dev) struct vmd_dev *vmd = pci_get_drvdata(pdev); int err, i; + if (vmd->irq_domain) + vmd_set_msi_remapping(vmd, true); + else + vmd_set_msi_remapping(vmd, false); + for (i = 0; i < vmd->msix_count; i++) { err = devm_request_irq(dev, vmd->irqs[i].virq, vmd_irq, IRQF_NO_THREAD, -- GitLab From 3a936b2a5a581e50dd5cab20a48eb0055a527f02 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Nov 2022 10:07:04 +0100 Subject: [PATCH 0563/1423] dt-bindings: PCI: qcom: Add SC8280XP/SA8540P interconnects Add the missing SC8280XP/SA8540P "pcie-mem" and "cpu-pcie" interconnect paths to the bindings. Link: https://lore.kernel.org/r/20221102090705.23634-2-johan+linaro@kernel.org Fixes: 76d777ae045e ("dt-bindings: PCI: qcom: Add SC8280XP to binding") Fixes: 76c4207f4085 ("dt-bindings: PCI: qcom: Add SA8540P to binding") Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Manivannan Sadhasivam --- .../devicetree/bindings/pci/qcom,pcie.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index 54f07852d279e..2f851c804bb07 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -62,6 +62,14 @@ properties: minItems: 3 maxItems: 13 + interconnects: + maxItems: 2 + + interconnect-names: + items: + - const: pcie-mem + - const: cpu-pcie + resets: minItems: 1 maxItems: 12 @@ -631,6 +639,18 @@ allOf: items: - const: pci # PCIe core reset + - if: + properties: + compatible: + contains: + enum: + - qcom,pcie-sa8540p + - qcom,pcie-sc8280xp + then: + required: + - interconnects + - interconnect-names + - if: not: properties: -- GitLab From c4860af88d0cb1bb006df12615c5515ae509f73b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Nov 2022 10:07:05 +0100 Subject: [PATCH 0564/1423] PCI: qcom: Add basic interconnect support On Qualcomm platforms like SC8280XP and SA8540P, interconnect bandwidth must be requested before enabling interconnect clocks. Add basic support for managing an optional "pcie-mem" interconnect path by setting a low constraint before enabling clocks and updating it after the link is up. Note that it is not possible for a controller driver to set anything but a maximum peak bandwidth as expected average bandwidth will vary with use case and actual use (and power policy?). This very much remains an unresolved problem with the interconnect framework. Also note that no constraint is set for the SC8280XP/SA8540P "cpu-pcie" path for now as it is not clear what an appropriate constraint would be (and the system does not crash when left unspecified). Link: https://lore.kernel.org/r/20221102090705.23634-3-johan+linaro@kernel.org Fixes: 70574511f3fc ("PCI: qcom: Add support for SC8280XP") Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Reviewed-by: Brian Masney Reviewed-by: Manivannan Sadhasivam Acked-by: Georgi Djakov --- drivers/pci/controller/dwc/pcie-qcom.c | 76 ++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index f711acacaeaf8..ec230b988ba25 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -223,6 +224,7 @@ struct qcom_pcie { union qcom_pcie_resources res; struct phy *phy; struct gpio_desc *reset; + struct icc_path *icc_mem; const struct qcom_pcie_cfg *cfg; }; @@ -1639,6 +1641,74 @@ static const struct dw_pcie_ops dw_pcie_ops = { .start_link = qcom_pcie_start_link, }; +static int qcom_pcie_icc_init(struct qcom_pcie *pcie) +{ + struct dw_pcie *pci = pcie->pci; + int ret; + + pcie->icc_mem = devm_of_icc_get(pci->dev, "pcie-mem"); + if (IS_ERR(pcie->icc_mem)) + return PTR_ERR(pcie->icc_mem); + + /* + * Some Qualcomm platforms require interconnect bandwidth constraints + * to be set before enabling interconnect clocks. + * + * Set an initial peak bandwidth corresponding to single-lane Gen 1 + * for the pcie-mem path. + */ + ret = icc_set_bw(pcie->icc_mem, 0, MBps_to_icc(250)); + if (ret) { + dev_err(pci->dev, "failed to set interconnect bandwidth: %d\n", + ret); + return ret; + } + + return 0; +} + +static void qcom_pcie_icc_update(struct qcom_pcie *pcie) +{ + struct dw_pcie *pci = pcie->pci; + u32 offset, status, bw; + int speed, width; + int ret; + + if (!pcie->icc_mem) + return; + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + status = readw(pci->dbi_base + offset + PCI_EXP_LNKSTA); + + /* Only update constraints if link is up. */ + if (!(status & PCI_EXP_LNKSTA_DLLLA)) + return; + + speed = FIELD_GET(PCI_EXP_LNKSTA_CLS, status); + width = FIELD_GET(PCI_EXP_LNKSTA_NLW, status); + + switch (speed) { + case 1: + bw = MBps_to_icc(250); + break; + case 2: + bw = MBps_to_icc(500); + break; + default: + WARN_ON_ONCE(1); + fallthrough; + case 3: + bw = MBps_to_icc(985); + break; + } + + ret = icc_set_bw(pcie->icc_mem, 0, width * bw); + if (ret) { + dev_err(pci->dev, "failed to set interconnect bandwidth: %d\n", + ret); + } +} + static int qcom_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -1699,6 +1769,10 @@ static int qcom_pcie_probe(struct platform_device *pdev) goto err_pm_runtime_put; } + ret = qcom_pcie_icc_init(pcie); + if (ret) + goto err_pm_runtime_put; + ret = pcie->cfg->ops->get_resources(pcie); if (ret) goto err_pm_runtime_put; @@ -1717,6 +1791,8 @@ static int qcom_pcie_probe(struct platform_device *pdev) goto err_phy_exit; } + qcom_pcie_icc_update(pcie); + return 0; err_phy_exit: -- GitLab From 2759ddf7535d63381f9b9b1412e4c46e13ed773a Mon Sep 17 00:00:00 2001 From: Shunsuke Mie Date: Mon, 15 Aug 2022 11:50:06 +0900 Subject: [PATCH 0565/1423] PCI: endpoint: Fix Kconfig indent style Change to follow the Kconfig style guide. This patch fixes to use tab rather than space to indent, while help text is indented an additional two spaces. Link: https://lore.kernel.org/r/20220815025006.48167-1-mie@igel.co.jp Fixes: e35f56bb0330 ("PCI: endpoint: Support NTB transfer between RC and EP") Signed-off-by: Shunsuke Mie Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/functions/Kconfig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pci/endpoint/functions/Kconfig b/drivers/pci/endpoint/functions/Kconfig index 295a033ee9a27..9fd5608868718 100644 --- a/drivers/pci/endpoint/functions/Kconfig +++ b/drivers/pci/endpoint/functions/Kconfig @@ -27,13 +27,13 @@ config PCI_EPF_NTB If in doubt, say "N" to disable Endpoint NTB driver. config PCI_EPF_VNTB - tristate "PCI Endpoint NTB driver" - depends on PCI_ENDPOINT - depends on NTB - select CONFIGFS_FS - help - Select this configuration option to enable the Non-Transparent - Bridge (NTB) driver for PCIe Endpoint. NTB driver implements NTB - between PCI Root Port and PCIe Endpoint. + tristate "PCI Endpoint NTB driver" + depends on PCI_ENDPOINT + depends on NTB + select CONFIGFS_FS + help + Select this configuration option to enable the Non-Transparent + Bridge (NTB) driver for PCIe Endpoint. NTB driver implements NTB + between PCI Root Port and PCIe Endpoint. - If in doubt, say "N" to disable Endpoint NTB driver. + If in doubt, say "N" to disable Endpoint NTB driver. -- GitLab From ae6b9a65af480144da323436d90e149501ea8937 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 1 Nov 2022 10:57:14 +0100 Subject: [PATCH 0566/1423] PCI: imx6: Initialize PHY before deasserting core reset When the PHY is the reference clock provider then it must be initialized and powered on before the reset on the client is deasserted, otherwise the link will never come up. The order was changed in cf236e0c0d59. Restore the correct order to make the driver work again on boards where the PHY provides the reference clock. This also changes the order for boards where the Soc is the PHY reference clock divider, but this shouldn't do any harm. Link: https://lore.kernel.org/r/20221101095714.440001-1-s.hauer@pengutronix.de Fixes: cf236e0c0d59 ("PCI: imx6: Do not hide PHY driver callbacks and refine the error handling") Tested-by: Richard Zhu Signed-off-by: Sascha Hauer Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pci-imx6.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 2616585ca5f8a..1dde5c579edc8 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -952,12 +952,6 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) } } - ret = imx6_pcie_deassert_core_reset(imx6_pcie); - if (ret < 0) { - dev_err(dev, "pcie deassert core reset failed: %d\n", ret); - goto err_phy_off; - } - if (imx6_pcie->phy) { ret = phy_power_on(imx6_pcie->phy); if (ret) { @@ -965,6 +959,13 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) goto err_phy_off; } } + + ret = imx6_pcie_deassert_core_reset(imx6_pcie); + if (ret < 0) { + dev_err(dev, "pcie deassert core reset failed: %d\n", ret); + goto err_phy_off; + } + imx6_setup_phy_mpll(imx6_pcie); return 0; -- GitLab From 83f8a81dece8bc4237d8d94af357fb5df0083e63 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 3 Nov 2022 13:12:10 +0000 Subject: [PATCH 0567/1423] KVM: arm64: Fix pvtime documentation This includes table format and using reST labels for cross-referencing to vcpu.rst. Suggested-by: Bagas Sanjaya Signed-off-by: Usama Arif Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221103131210.3603385-1-usama.arif@bytedance.com --- Documentation/virt/kvm/arm/pvtime.rst | 14 ++++++++------ Documentation/virt/kvm/devices/vcpu.rst | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 392521af7c905..e88b34e586be2 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -23,21 +23,23 @@ the PV_TIME_FEATURES hypercall should be probed using the SMCCC 1.1 ARCH_FEATURES mechanism before calling it. PV_TIME_FEATURES - ============= ======== ========== + + ============= ======== ================================================= Function ID: (uint32) 0xC5000020 PV_call_id: (uint32) The function to query for support. Currently only PV_TIME_ST is supported. Return value: (int64) NOT_SUPPORTED (-1) or SUCCESS (0) if the relevant PV-time feature is supported by the hypervisor. - ============= ======== ========== + ============= ======== ================================================= PV_TIME_ST - ============= ======== ========== + + ============= ======== ============================================== Function ID: (uint32) 0xC5000021 Return value: (int64) IPA of the stolen time data structure for this VCPU. On failure: NOT_SUPPORTED (-1) - ============= ======== ========== + ============= ======== ============================================== The IPA returned by PV_TIME_ST should be mapped by the guest as normal memory with inner and outer write back caching attributes, in the inner shareable @@ -76,5 +78,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of these structures and not used for other purposes, this enables the guest to map the region using 64k pages and avoids conflicting attributes with other memory. -For the user space interface see Documentation/virt/kvm/devices/vcpu.rst -section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL". +For the user space interface see +:ref:`Documentation/virt/kvm/devices/vcpu.rst `. \ No newline at end of file diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 716aa3edae14f..31f14ec4a65b6 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -171,6 +171,8 @@ configured values on other VCPUs. Userspace should configure the interrupt numbers on at least one VCPU after creating all VCPUs and before running any VCPUs. +.. _kvm_arm_vcpu_pvtime_ctrl: + 3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL ================================== -- GitLab From e1b3253340029b06f5f648d8390807cba4e4ec23 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Sun, 6 Nov 2022 20:30:40 +0800 Subject: [PATCH 0568/1423] KVM: arm64: Fix typo in comment Fix typo in comment (nVHE/VHE). Signed-off-by: Zhiyuan Dai Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1667737840-702-1-git-send-email-daizhiyuan@phytium.com.cn --- arch/arm64/kvm/hyp/vhe/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index 96bec0ecf9dd9..3b9e5464b5b39 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # -# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part +# Makefile for Kernel-based Virtual Machine module, HYP/VHE part # asflags-y := -D__KVM_VHE_HYPERVISOR__ -- GitLab From 0f4f7ae10ee4e6403659b2d9ddf05424eecde45b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:34 +0000 Subject: [PATCH 0569/1423] KVM: arm64: Move hyp refcount manipulation helpers to common header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will soon need to manipulate 'struct hyp_page' refcounts from outside page_alloc.c, so move the helpers to a common header file to allow them to be reused easily. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Oliver Upton Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-2-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/memory.h | 22 ++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/page_alloc.c | 19 ------------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 592b7edb3edb4..9422900e5c6a0 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -38,6 +38,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +/* + * Refcounting for 'struct hyp_page'. + * hyp_pool::lock must be held if atomic access to the refcount is required. + */ static inline int hyp_page_count(void *addr) { struct hyp_page *p = hyp_virt_to_page(addr); @@ -45,4 +49,22 @@ static inline int hyp_page_count(void *addr) return p->refcount; } +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + BUG_ON(!p->refcount); + p->refcount--; + return (p->refcount == 0); +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + BUG_ON(p->refcount); + p->refcount = 1; +} #endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index d40f0b30b5347..1ded09fc9b104 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -144,25 +144,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, return p; } -static inline void hyp_page_ref_inc(struct hyp_page *p) -{ - BUG_ON(p->refcount == USHRT_MAX); - p->refcount++; -} - -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) -{ - BUG_ON(!p->refcount); - p->refcount--; - return (p->refcount == 0); -} - -static inline void hyp_set_page_refcounted(struct hyp_page *p) -{ - BUG_ON(p->refcount); - p->refcount = 1; -} - static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) { if (hyp_page_ref_dec_and_test(p)) -- GitLab From 72a5bc0f153ce8ca80e9abbd1d9adec7d586915a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:35 +0000 Subject: [PATCH 0570/1423] KVM: arm64: Allow attaching of non-coalescable pages to a hyp pool All the contiguous pages used to initialize a 'struct hyp_pool' are considered coalescable, which means that the hyp page allocator will actively try to merge them with their buddies on the hyp_put_page() path. However, using hyp_put_page() on a page that is not part of the inital memory range given to a hyp_pool() is currently unsupported. In order to allow dynamically extending hyp pools at run-time, add a check to __hyp_attach_page() to allow inserting 'external' pages into the free-list of order 0. This will be necessary to allow lazy donation of pages from the host to the hypervisor when allocating guest stage-2 page-table pages at EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-3-will@kernel.org --- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 1ded09fc9b104..dad88e2035984 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -93,11 +93,16 @@ static inline struct hyp_page *node_to_page(struct list_head *node) static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { + phys_addr_t phys = hyp_page_to_phys(p); unsigned short order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + /* Skip coalescing for 'external' pages being freed into the pool. */ + if (phys < pool->range_start || phys >= pool->range_end) + goto insert; + /* * Only the first struct hyp_page of a high-order page (otherwise known * as the 'head') should have p->order set. The non-head pages should @@ -116,6 +121,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, p = min(p, buddy); } +insert: /* Mark the new head, and insert it */ p->order = order; page_add_to_list(p, &pool->free_area[order]); -- GitLab From 8e6bcc3a4502a0d8d065466efd888b6b59b85789 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:36 +0000 Subject: [PATCH 0571/1423] KVM: arm64: Back the hypervisor 'struct hyp_page' array for all memory The EL2 'vmemmap' array in nVHE Protected mode is currently very sparse: only memory pages owned by the hypervisor itself have a matching 'struct hyp_page'. However, as the size of this struct has been reduced significantly since its introduction, it appears that we can now afford to back the vmemmap for all of memory. Having an easily accessible 'struct hyp_page' for every physical page in memory provides the hypervisor with a simple mechanism to store metadata (e.g. a refcount) that wouldn't otherwise fit in the very limited number of software bits available in the host stage-2 page-table entries. This will be used in subsequent patches when pinning host memory pages for use by the hypervisor at EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-4-will@kernel.org --- arch/arm64/include/asm/kvm_pkvm.h | 26 +++++++++++++++++++++++ arch/arm64/kvm/hyp/include/nvhe/mm.h | 14 +------------ arch/arm64/kvm/hyp/nvhe/mm.c | 31 ++++++++++++++++++++++++---- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 4 +--- arch/arm64/kvm/hyp/nvhe/setup.c | 7 +++---- arch/arm64/kvm/pkvm.c | 18 ++-------------- 6 files changed, 60 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 9f4ad2a8df59c..8f7b8a2314bbb 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -14,6 +14,32 @@ extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); +static inline unsigned long +hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) +{ + unsigned long nr_pages = reg->size >> PAGE_SHIFT; + unsigned long start, end; + + start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; + end = start + nr_pages * vmemmap_entry_size; + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); + + return end - start; +} + +static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) +{ + unsigned long res = 0, i; + + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { + res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i], + vmemmap_entry_size); + } + + return res >> PAGE_SHIFT; +} + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0, i; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 42d8eb9bfe725..b2ee6d5df55b1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -15,7 +15,7 @@ extern hyp_spinlock_t pkvm_pgd_lock; int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); +int hyp_back_vmemmap(phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); @@ -24,16 +24,4 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, unsigned long *haddr); int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); -static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, - unsigned long *start, unsigned long *end) -{ - unsigned long nr_pages = size >> PAGE_SHIFT; - struct hyp_page *p = hyp_phys_to_page(phys); - - *start = (unsigned long)p; - *end = *start + nr_pages * sizeof(struct hyp_page); - *start = ALIGN_DOWN(*start, PAGE_SIZE); - *end = ALIGN(*end, PAGE_SIZE); -} - #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 96193cb31a399..d3a3b47181de0 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -129,13 +129,36 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return ret; } -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) +int hyp_back_vmemmap(phys_addr_t back) { - unsigned long start, end; + unsigned long i, start, size, end = 0; + int ret; - hyp_vmemmap_range(phys, size, &start, &end); + for (i = 0; i < hyp_memblock_nr; i++) { + start = hyp_memory[i].base; + start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); + /* + * The begining of the hyp_vmemmap region for the current + * memblock may already be backed by the page backing the end + * the previous region, so avoid mapping it twice. + */ + start = max(start, end); + + end = hyp_memory[i].base + hyp_memory[i].size; + end = PAGE_ALIGN((u64)hyp_phys_to_page(end)); + if (start >= end) + continue; + + size = end - start; + ret = __pkvm_create_mappings(start, size, back, PAGE_HYP); + if (ret) + return ret; + + memset(hyp_phys_to_virt(back), 0, size); + back += size; + } - return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); + return 0; } static void *__hyp_bp_vect_base; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index dad88e2035984..803ba3222e75f 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -236,10 +236,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, /* Init the vmemmap portion */ p = hyp_phys_to_page(phys); - for (i = 0; i < nr_pages; i++) { - p[i].order = 0; + for (i = 0; i < nr_pages; i++) hyp_set_page_refcounted(&p[i]); - } /* Attach the unused pages to the buddy tree */ for (i = reserved_pages; i < nr_pages; i++) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index e8d4ea2fcfa09..579eb4f734768 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -31,12 +31,11 @@ static struct hyp_pool hpool; static int divide_memory_pool(void *virt, unsigned long size) { - unsigned long vstart, vend, nr_pages; + unsigned long nr_pages; hyp_early_alloc_init(virt, size); - hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); - nr_pages = (vend - vstart) >> PAGE_SHIFT; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) return -ENOMEM; @@ -78,7 +77,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); + ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base)); if (ret) return ret; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index ebecb7c045f43..34229425b25d3 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -53,7 +53,7 @@ static int __init register_memblock_regions(void) void __init kvm_hyp_reserve(void) { - u64 nr_pages, prev, hyp_mem_pages = 0; + u64 hyp_mem_pages = 0; int ret; if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) @@ -71,21 +71,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); - - /* - * The hyp_vmemmap needs to be backed by pages, but these pages - * themselves need to be present in the vmemmap, so compute the number - * of pages needed by looking for a fixed point. - */ - nr_pages = 0; - do { - prev = nr_pages; - nr_pages = hyp_mem_pages + prev; - nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE, - PAGE_SIZE); - nr_pages += __hyp_pgtable_max_pages(nr_pages); - } while (nr_pages != prev); - hyp_mem_pages += nr_pages; + hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once -- GitLab From 0d16d12eb26ef85602ef8a678d94825a66772774 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:37 +0000 Subject: [PATCH 0572/1423] KVM: arm64: Fix-up hyp stage-1 refcounts for all pages mapped at EL2 In order to allow unmapping arbitrary memory pages from the hypervisor stage-1 page-table, fix-up the initial refcount for pages that have been mapped before the 'vmemmap' array was up and running so that it accurately accounts for all existing hypervisor mappings. This is achieved by traversing the entire hypervisor stage-1 page-table during initialisation of EL2 and updating the corresponding 'struct hyp_page' for each valid mapping. Reviewed-by: Oliver Upton Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-5-will@kernel.org --- arch/arm64/kvm/hyp/nvhe/setup.c | 62 +++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 579eb4f734768..8f2726d7e2010 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -185,12 +185,11 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } -static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) { - struct kvm_pgtable_mm_ops *mm_ops = arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; kvm_pte_t pte = *ptep; @@ -199,15 +198,6 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, if (!kvm_pte_valid(pte)) return 0; - /* - * Fix-up the refcount for the page-table pages as the early allocator - * was unable to access the hyp_vmemmap and so the buddy allocator has - * initialised the refcount to '1'. - */ - mm_ops->get_page(ptep); - if (flag != KVM_PGTABLE_WALK_LEAF) - return 0; - if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; @@ -236,12 +226,30 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); } -static int finalize_host_mappings(void) +static int fix_hyp_pgtable_refcnt_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_pgtable_mm_ops *mm_ops = arg; + kvm_pte_t pte = *ptep; + + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. + */ + if (kvm_pte_valid(pte)) + mm_ops->get_page(ptep); + + return 0; +} + +static int fix_host_ownership(void) { struct kvm_pgtable_walker walker = { - .cb = finalize_host_mappings_walker, - .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pkvm_pgtable.mm_ops, + .cb = fix_host_ownership_walker, + .flags = KVM_PGTABLE_WALK_LEAF, }; int i, ret; @@ -257,6 +265,18 @@ static int finalize_host_mappings(void) return 0; } +static int fix_hyp_pgtable_refcnt(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_hyp_pgtable_refcnt_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), + &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -286,7 +306,11 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; - ret = finalize_host_mappings(); + ret = fix_host_ownership(); + if (ret) + goto out; + + ret = fix_hyp_pgtable_refcnt(); if (ret) goto out; -- GitLab From 33bc332d4061e95db55594893c4f80105b1dd813 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:38 +0000 Subject: [PATCH 0573/1423] KVM: arm64: Unify identifiers used to distinguish host and hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'pkvm_component_id' enum type provides constants to refer to the host and the hypervisor, yet this information is duplicated by the 'pkvm_hyp_id' constant. Remove the definition of 'pkvm_hyp_id' and move the 'pkvm_component_id' type definition to 'mem_protect.h' so that it can be used outside of the memory protection code, for example when initialising the owner for hypervisor-owned pages. Reviewed-by: Oliver Upton Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-6-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 6 +++++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 8 -------- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 80e99836eac79..f5705a1e972ff 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -51,7 +51,11 @@ struct host_kvm { }; extern struct host_kvm host_kvm; -extern const u8 pkvm_hyp_id; +/* This corresponds to page-table locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, +}; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 1e78acf9662eb..ff86f5bd230f8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -26,8 +26,6 @@ struct host_kvm host_kvm; static struct hyp_pool host_s2_pool; -const u8 pkvm_hyp_id = 1; - static void host_lock_component(void) { hyp_spin_lock(&host_kvm.lock); @@ -380,12 +378,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(ret && ret != -EAGAIN); } -/* This corresponds to locking order */ -enum pkvm_component_id { - PKVM_ID_HOST, - PKVM_ID_HYP, -}; - struct pkvm_mem_transition { u64 nr_pages; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 8f2726d7e2010..0312c9c74a5a4 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -212,7 +212,7 @@ static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); break; -- GitLab From 1ed5c24c26f48ff61dc5d97c655769821f36a622 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:39 +0000 Subject: [PATCH 0574/1423] KVM: arm64: Implement do_donate() helper for donating memory Transferring ownership information of a memory region from one component to another can be achieved using a "donate" operation, which results in the previous owner losing access to the underlying pages entirely and the new owner having exclusive access to the page. Implement a do_donate() helper, along the same lines as do_{un,}share, and provide this functionality for the host-{to,from}-hyp cases as this will later be used to donate/reclaim memory pages to store VM metadata at EL2. In a similar manner to the sharing transitions, permission checks are performed by the hypervisor to ensure that the component initiating the transition really is the owner of the page and also that the completer does not currently have a page mapped at the target address. Tested-by: Vincent Donnefort Co-developed-by: Quentin Perret Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-7-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 239 ++++++++++++++++++ 2 files changed, 241 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index f5705a1e972ff..c87b19b2d4680 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -60,6 +60,8 @@ enum pkvm_component_id { int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ff86f5bd230f8..10069cd327873 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -391,6 +391,9 @@ struct pkvm_mem_transition { /* Address in the completer's address space */ u64 completer_addr; } host; + struct { + u64 completer_addr; + } hyp; }; } initiator; @@ -404,6 +407,10 @@ struct pkvm_mem_share { const enum kvm_pgtable_prot completer_prot; }; +struct pkvm_mem_donation { + const struct pkvm_mem_transition tx; +}; + struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); @@ -503,6 +510,46 @@ static int host_initiate_unshare(u64 *completer_addr, return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); } +static int host_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u8 owner_id = tx->completer.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + *completer_addr = tx->initiator.host.completer_addr; + return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); +} + +static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HYP); +} + +static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__host_ack_skip_pgtable_check(tx)) + return 0; + + return __host_check_page_state_range(addr, size, state); +} + +static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + +static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u8 host_id = tx->completer.id; + + return host_stage2_set_owner_locked(addr, size, host_id); +} + static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) { if (!kvm_pte_valid(pte)) @@ -523,6 +570,27 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } +static int hyp_request_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.hyp.completer_addr; + return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int hyp_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret; + + *completer_addr = tx->initiator.hyp.completer_addr; + ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); + return (ret != size) ? -EFAULT : 0; +} + static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) { return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || @@ -554,6 +622,16 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) PKVM_PAGE_SHARED_BORROWED); } +static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, enum kvm_pgtable_prot perms) { @@ -572,6 +650,15 @@ static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) return (ret != size) ? -EFAULT : 0; } +static int hyp_complete_donation(u64 addr, + const struct pkvm_mem_transition *tx) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + + return pkvm_create_mappings_locked(start, end, prot); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -724,6 +811,94 @@ static int do_unshare(struct pkvm_mem_share *share) return WARN_ON(__do_unshare(share)); } +static int check_donation(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_request_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_ack_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_donate(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_donation(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_initiate_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_complete_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_donate(): + * + * The page owner transfers ownership to another component, losing access + * as a consequence. + * + * Initiator: OWNED => NOPAGE + * Completer: NOPAGE => OWNED + */ +static int do_donate(struct pkvm_mem_donation *donation) +{ + int ret; + + ret = check_donation(donation); + if (ret) + return ret; + + return WARN_ON(__do_donate(donation)); +} + int __pkvm_host_share_hyp(u64 pfn) { int ret; @@ -789,3 +964,67 @@ int __pkvm_host_unshare_hyp(u64 pfn) return ret; } + +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HYP, + .addr = hyp_addr, + .hyp = { + .completer_addr = host_addr, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} -- GitLab From 43c1ff8b75011bc3e3e923adf31ba815864a2494 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:40 +0000 Subject: [PATCH 0575/1423] KVM: arm64: Prevent the donation of no-map pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory regions marked as "no-map" in the host device-tree routinely include TrustZone carev-outs and DMA pools. Although donating such pages to the hypervisor may not breach confidentiality, it could be used to corrupt its state in uncontrollable ways. To prevent this, let's block host-initiated memory transitions targeting "no-map" pages altogether in nVHE protected mode as there should be no valid reason to do this in current operation. Thankfully, the pKVM EL2 hypervisor has a full copy of the host's list of memblock regions, so we can easily check for the presence of the MEMBLOCK_NOMAP flag on a region containing pages being donated from the host. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-8-will@kernel.org --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 10069cd327873..f7e3afaf9f119 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -193,7 +193,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -216,18 +216,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -346,7 +356,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + bool is_memory = !!find_mem_range(addr, &range); enum kvm_pgtable_prot prot; int ret; @@ -424,7 +434,7 @@ static int __check_page_state_visitor(u64 addr, u64 end, u32 level, struct check_walk_data *d = arg; kvm_pte_t pte = *ptep; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) + if (kvm_pte_valid(pte) && !addr_is_allowed_memory(kvm_pte_to_phys(pte))) return -EINVAL; return d->get_page_state(pte) == d->desired ? 0 : -EPERM; -- GitLab From 9926cfce8dcb880255f30ab9ac930add787e1ead Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:41 +0000 Subject: [PATCH 0576/1423] KVM: arm64: Add helpers to pin memory shared with the hypervisor at EL2 Add helpers allowing the hypervisor to check whether a range of pages are currently shared by the host, and 'pin' them if so by blocking host unshare operations until the memory has been unpinned. This will allow the hypervisor to take references on host-provided data-structures (e.g. 'struct kvm') with the guarantee that these pages will remain in a stable state until the hypervisor decides to release them, for example during guest teardown. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-9-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++ arch/arm64/kvm/hyp/include/nvhe/memory.h | 7 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 48 +++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index c87b19b2d4680..998bf165af711 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -69,6 +69,9 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); +int hyp_pin_shared_mem(void *from, void *to); +void hyp_unpin_shared_mem(void *from, void *to); + static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 9422900e5c6a0..ab205c4d67748 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -55,10 +55,15 @@ static inline void hyp_page_ref_inc(struct hyp_page *p) p->refcount++; } -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +static inline void hyp_page_ref_dec(struct hyp_page *p) { BUG_ON(!p->refcount); p->refcount--; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + hyp_page_ref_dec(p); return (p->refcount == 0); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f7e3afaf9f119..83c2f67e1b582 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -625,6 +625,9 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; + if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) + return -EBUSY; + if (__hyp_ack_skip_pgtable_check(tx)) return 0; @@ -1038,3 +1041,48 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) return ret; } + +int hyp_pin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +void hyp_unpin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); + + hyp_unlock_component(); + host_unlock_component(); +} -- GitLab From 4d968b12e6bbe4440f4f220c41d779e02df8af1a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:42 +0000 Subject: [PATCH 0577/1423] KVM: arm64: Include asm/kvm_mmu.h in nvhe/mem_protect.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvhe/mem_protect.h refers to __load_stage2() in the definition of __load_host_stage2() but doesn't include the relevant header. Include asm/kvm_mmu.h in nvhe/mem_protect.h so that users of the latter don't have to do this themselves. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-10-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 998bf165af711..3bea816296dc8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -8,6 +8,7 @@ #define __KVM_NVHE_MEM_PROTECT__ #include #include +#include #include #include #include -- GitLab From 1c80002e3264552d8b9c0e162e09aa4087403716 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 10 Nov 2022 19:02:43 +0000 Subject: [PATCH 0578/1423] KVM: arm64: Add hyp_spinlock_t static initializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a static initializer macro for 'hyp_spinlock_t' so that it is straightforward to instantiate global locks at EL2. This will be later utilised for locking the VM table in the hypervisor. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-11-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/spinlock.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 4652fd04bdbee..7c7ea8c55405f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -28,9 +28,17 @@ typedef union hyp_spinlock { }; } hyp_spinlock_t; +#define __HYP_SPIN_LOCK_INITIALIZER \ + { .__val = 0 } + +#define __HYP_SPIN_LOCK_UNLOCKED \ + ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER) + +#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED + #define hyp_spin_lock_init(l) \ do { \ - *(l) = (hyp_spinlock_t){ .__val = 0 }; \ + *(l) = __HYP_SPIN_LOCK_UNLOCKED; \ } while (0) static inline void hyp_spin_lock(hyp_spinlock_t *lock) -- GitLab From 5304002dc3754a5663d75c977bfa2d9e3c08906d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:44 +0000 Subject: [PATCH 0579/1423] KVM: arm64: Rename 'host_kvm' to 'host_mmu' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for introducing VM and vCPU state at EL2, rename the existing 'struct host_kvm' and its singleton 'host_kvm' instance to 'host_mmu' so as to avoid confusion between the structure tracking the host stage-2 MMU state and the host instance of a 'struct kvm' for a protected guest. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-12-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 6 +-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 46 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 3bea816296dc8..0a6d3e7f2a435 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -44,13 +44,13 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) return prot & PKVM_PAGE_STATE_PROT_MASK; } -struct host_kvm { +struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; struct kvm_pgtable_mm_ops mm_ops; hyp_spinlock_t lock; }; -extern struct host_kvm host_kvm; +extern struct host_mmu host_mmu; /* This corresponds to page-table locking order */ enum pkvm_component_id { @@ -76,7 +76,7 @@ void hyp_unpin_shared_mem(void *from, void *to); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 83c2f67e1b582..06c6a24c0eae6 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -22,18 +22,18 @@ #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) extern unsigned long hyp_nr_cpus; -struct host_kvm host_kvm; +struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; static void host_lock_component(void) { - hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&host_mmu.lock); } static void host_unlock_component(void) { - hyp_spin_unlock(&host_kvm.lock); + hyp_spin_unlock(&host_mmu.lock); } static void hyp_lock_component(void) @@ -88,7 +88,7 @@ static int prepare_s2_pool(void *pgt_pool_base) if (ret) return ret; - host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { + host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, .phys_to_virt = hyp_phys_to_virt, @@ -109,7 +109,7 @@ static void prepare_host_vtcr(void) parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, id_aa64mmfr1_el1_sys_val, phys_shift); } @@ -117,25 +117,25 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr int kvm_host_prepare_stage2(void *pgt_pool_base) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; int ret; prepare_host_vtcr(); - hyp_spin_lock_init(&host_kvm.lock); - mmu->arch = &host_kvm.arch; + hyp_spin_lock_init(&host_mmu.lock); + mmu->arch = &host_mmu.arch; ret = prepare_s2_pool(pgt_pool_base); if (ret) return ret; - ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, + &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, host_stage2_force_pte_cb); if (ret) return ret; - mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); - mmu->pgt = &host_kvm.pgt; + mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd); + mmu->pgt = &host_mmu.pgt; atomic64_set(&mmu->vmid.id, 0); return 0; @@ -143,19 +143,19 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) int __pkvm_prot_finalize(void) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); if (params->hcr_el2 & HCR_VM) return -EPERM; params->vttbr = kvm_get_vttbr(mmu); - params->vtcr = host_kvm.arch.vtcr; + params->vtcr = host_mmu.arch.vtcr; params->hcr_el2 |= HCR_VM; kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -173,7 +173,7 @@ int __pkvm_prot_finalize(void) static int host_stage2_unmap_dev_all(void) { - struct kvm_pgtable *pgt = &host_kvm.pgt; + struct kvm_pgtable *pgt = &host_mmu.pgt; struct memblock_region *reg; u64 addr = 0; int i, ret; @@ -258,7 +258,7 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { - return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, + return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, prot, &host_s2_pool); } @@ -271,7 +271,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, #define host_stage2_try(fn, ...) \ ({ \ int __ret; \ - hyp_assert_lock_held(&host_kvm.lock); \ + hyp_assert_lock_held(&host_mmu.lock); \ __ret = fn(__VA_ARGS__); \ if (__ret == -ENOMEM) { \ __ret = host_stage2_unmap_dev_all(); \ @@ -294,8 +294,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) u32 level; int ret; - hyp_assert_lock_held(&host_kvm.lock); - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + hyp_assert_lock_held(&host_mmu.lock); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); if (ret) return ret; @@ -327,7 +327,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, addr, size, &host_s2_pool, owner_id); } @@ -468,8 +468,8 @@ static int __host_check_page_state_range(u64 addr, u64 size, .get_page_state = host_get_page_state, }; - hyp_assert_lock_held(&host_kvm.lock); - return check_page_state_range(&host_kvm.pgt, addr, size, &d); + hyp_assert_lock_held(&host_mmu.lock); + return check_page_state_range(&host_mmu.pgt, addr, size, &d); } static int __host_set_page_state_range(u64 addr, u64 size, -- GitLab From a1ec5c70d3f63d8a143fb83cd7f53bd8ff2f72c8 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 10 Nov 2022 19:02:45 +0000 Subject: [PATCH 0580/1423] KVM: arm64: Add infrastructure to create and track pKVM instances at EL2 Introduce a global table (and lock) to track pKVM instances at EL2, and provide hypercalls that can be used by the untrusted host to create and destroy pKVM VMs and their vCPUs. pKVM VM/vCPU state is directly accessible only by the trusted hypervisor (EL2). Each pKVM VM is directly associated with an untrusted host KVM instance, and is referenced by the host using an opaque handle. Future patches will provide hypercalls to allow the host to initialize/set/get pKVM VM/vCPU state using the opaque handle. Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Co-developed-by: Will Deacon Signed-off-by: Will Deacon [maz: silence warning on unmap_donated_memory_noclear()] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-13-will@kernel.org --- arch/arm64/include/asm/kvm_asm.h | 3 + arch/arm64/include/asm/kvm_host.h | 8 + arch/arm64/include/asm/kvm_pgtable.h | 8 + arch/arm64/include/asm/kvm_pkvm.h | 8 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 58 +++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 31 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 380 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 8 + arch/arm64/kvm/hyp/pgtable.c | 9 + arch/arm64/kvm/pkvm.c | 1 + 12 files changed, 531 insertions(+) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/pkvm.h diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 53035763e48e8..de52ba775d48f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,6 +76,9 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, + __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 45e2136322ba2..d3dd7ab9c79ea 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -115,6 +115,8 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +typedef unsigned int pkvm_handle_t; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -166,6 +168,12 @@ struct kvm_arch { /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + + /* + * For an untrusted host VM, 'pkvm_handle' is used to lookup + * the associated pKVM instance in the hypervisor. + */ + pkvm_handle_t pkvm_handle; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3252eb50ecfe5..15c389db19318 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -296,6 +296,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); +/** + * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD + * @vtcr: Content of the VTCR register. + * + * Return: the size (in bytes) of the stage-2 PGD + */ +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); + /** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 8f7b8a2314bbb..f4e3133d65500 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -9,6 +9,9 @@ #include #include +/* Maximum number of VMs that can co-exist under pKVM. */ +#define KVM_MAX_PVMS 255 + #define HYP_MEMBLOCK_REGIONS 128 extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; @@ -40,6 +43,11 @@ static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) return res >> PAGE_SHIFT; } +static inline unsigned long hyp_vm_table_pages(void) +{ + return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; +} + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0, i; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0a6d3e7f2a435..ce9a796a85eee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,6 +11,7 @@ #include #include #include +#include #include /* @@ -68,10 +69,12 @@ bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); +void reclaim_guest_pages(struct pkvm_hyp_vm *vm); static __always_inline void __load_host_stage2(void) { diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h new file mode 100644 index 0000000000000..8c653a3b9501d --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba + */ + +#ifndef __ARM64_KVM_NVHE_PKVM_H__ +#define __ARM64_KVM_NVHE_PKVM_H__ + +#include + +/* + * Holds the relevant data for maintaining the vcpu state completely at hyp. + */ +struct pkvm_hyp_vcpu { + struct kvm_vcpu vcpu; + + /* Backpointer to the host's (untrusted) vCPU instance. */ + struct kvm_vcpu *host_vcpu; +}; + +/* + * Holds the relevant data for running a protected vm. + */ +struct pkvm_hyp_vm { + struct kvm kvm; + + /* Backpointer to the host's (untrusted) KVM instance. */ + struct kvm *host_kvm; + + /* The guest's stage-2 page-table managed by the hypervisor. */ + struct kvm_pgtable pgt; + + /* + * The number of vcpus initialized and ready to run. + * Modifying this is protected by 'vm_table_lock'. + */ + unsigned int nr_vcpus; + + /* Array of the hyp vCPU structures for this VM. */ + struct pkvm_hyp_vcpu *vcpus[]; +}; + +static inline struct pkvm_hyp_vm * +pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); +} + +void pkvm_hyp_vm_table_init(void *tbl); + +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva); +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva); +int __pkvm_teardown_vm(pkvm_handle_t handle); + +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3cea4b6ac23ec..b5f3fcfe91357 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -15,6 +15,7 @@ #include #include +#include #include DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -191,6 +192,33 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); } +static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); + DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); + DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + + host_kvm = kern_hyp_va(host_kvm); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva); +} + +static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2); + DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3); + + host_vcpu = kern_hyp_va(host_vcpu); + cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); +} + +static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -220,6 +248,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__pkvm_init_vm), + HANDLE_FUNC(__pkvm_init_vcpu), + HANDLE_FUNC(__pkvm_teardown_vm), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 06c6a24c0eae6..459957b3082ea 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -141,6 +141,20 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) +{ + vm->pgt.pgd = pgd; + return 0; +} + +void reclaim_guest_pages(struct pkvm_hyp_vm *vm) +{ + unsigned long nr_pages; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages)); +} + int __pkvm_prot_finalize(void) { struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 85d3b7ae720fb..135c9a095eca2 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include /* @@ -183,3 +186,380 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) pvm_init_traps_aa64mmfr0(vcpu); pvm_init_traps_aa64mmfr1(vcpu); } + +/* + * Start the VM table handle at the offset defined instead of at 0. + * Mainly for sanity checking and debugging. + */ +#define HANDLE_OFFSET 0x1000 + +static unsigned int vm_handle_to_idx(pkvm_handle_t handle) +{ + return handle - HANDLE_OFFSET; +} + +static pkvm_handle_t idx_to_vm_handle(unsigned int idx) +{ + return idx + HANDLE_OFFSET; +} + +/* + * Spinlock for protecting state related to the VM table. Protects writes + * to 'vm_table' and 'nr_table_entries' as well as reads and writes to + * 'last_hyp_vcpu_lookup'. + */ +static DEFINE_HYP_SPINLOCK(vm_table_lock); + +/* + * The table of VM entries for protected VMs in hyp. + * Allocated at hyp initialization and setup. + */ +static struct pkvm_hyp_vm **vm_table; + +void pkvm_hyp_vm_table_init(void *tbl) +{ + WARN_ON(vm_table); + vm_table = tbl; +} + +/* + * Return the hyp vm structure corresponding to the handle. + */ +static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(idx >= KVM_MAX_PVMS)) + return NULL; + + return vm_table[idx]; +} + +static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) +{ + if (host_vcpu) + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); +} + +static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], + unsigned int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) + unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); +} + +static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, + unsigned int nr_vcpus) +{ + hyp_vm->host_kvm = host_kvm; + hyp_vm->kvm.created_vcpus = nr_vcpus; + hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; +} + +static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, + struct pkvm_hyp_vm *hyp_vm, + struct kvm_vcpu *host_vcpu, + unsigned int vcpu_idx) +{ + int ret = 0; + + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) + return -EBUSY; + + if (host_vcpu->vcpu_idx != vcpu_idx) { + ret = -EINVAL; + goto done; + } + + hyp_vcpu->host_vcpu = host_vcpu; + + hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; + hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); + hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + + hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; +done: + if (ret) + unpin_host_vcpu(host_vcpu); + return ret; +} + +static int find_free_vm_table_entry(struct kvm *host_kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_PVMS; ++i) { + if (!vm_table[i]) + return i; + } + + return -ENOMEM; +} + +/* + * Allocate a VM table entry and insert a pointer to the new vm. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, + struct pkvm_hyp_vm *hyp_vm) +{ + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = find_free_vm_table_entry(host_kvm); + if (idx < 0) + return idx; + + hyp_vm->kvm.arch.pkvm_handle = idx_to_vm_handle(idx); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; + + vm_table[idx] = hyp_vm; + return hyp_vm->kvm.arch.pkvm_handle; +} + +/* + * Deallocate and remove the VM table entry corresponding to the handle. + */ +static void remove_vm_table_entry(pkvm_handle_t handle) +{ + hyp_assert_lock_held(&vm_table_lock); + vm_table[vm_handle_to_idx(handle)] = NULL; +} + +static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus) +{ + return size_add(sizeof(struct pkvm_hyp_vm), + size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus)); +} + +static void *map_donated_memory_noclear(unsigned long host_va, size_t size) +{ + void *va = (void *)kern_hyp_va(host_va); + + if (!PAGE_ALIGNED(va)) + return NULL; + + if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)) + return NULL; + + return va; +} + +static void *map_donated_memory(unsigned long host_va, size_t size) +{ + void *va = map_donated_memory_noclear(host_va, size); + + if (va) + memset(va, 0, size); + + return va; +} + +static void __unmap_donated_memory(void *va, size_t size) +{ + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)); +} + +static void unmap_donated_memory(void *va, size_t size) +{ + if (!va) + return; + + memset(va, 0, size); + __unmap_donated_memory(va, size); +} + +static void __maybe_unused unmap_donated_memory_noclear(void *va, size_t size) +{ + if (!va) + return; + + __unmap_donated_memory(va, size); +} + +/* + * Initialize the hypervisor copy of the protected VM state using the + * memory donated by the host. + * + * Unmaps the donated memory from the host at stage 2. + * + * host_kvm: A pointer to the host's struct kvm. + * vm_hva: The host va of the area being donated for the VM state. + * Must be page aligned. + * pgd_hva: The host va of the area being donated for the stage-2 PGD for + * the VM. Must be page aligned. Its size is implied by the VM's + * VTCR. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva) +{ + struct pkvm_hyp_vm *hyp_vm = NULL; + size_t vm_size, pgd_size; + unsigned int nr_vcpus; + void *pgd = NULL; + int ret; + + ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); + if (ret) + return ret; + + nr_vcpus = READ_ONCE(host_kvm->created_vcpus); + if (nr_vcpus < 1) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); + + ret = -ENOMEM; + + hyp_vm = map_donated_memory(vm_hva, vm_size); + if (!hyp_vm) + goto err_remove_mappings; + + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); + if (!pgd) + goto err_remove_mappings; + + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); + + hyp_spin_lock(&vm_table_lock); + ret = insert_vm_table_entry(host_kvm, hyp_vm); + if (ret < 0) + goto err_unlock; + + ret = kvm_guest_prepare_stage2(hyp_vm, pgd); + if (ret) + goto err_remove_vm_table_entry; + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm->kvm.arch.pkvm_handle; + +err_remove_vm_table_entry: + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm_handle); +err_unlock: + hyp_spin_unlock(&vm_table_lock); +err_remove_mappings: + unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(pgd, pgd_size); +err_unpin_kvm: + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return ret; +} + +/* + * Initialize the hypervisor copy of the protected vCPU state using the + * memory donated by the host. + * + * handle: The handle for the protected vm. + * host_vcpu: A pointer to the corresponding host vcpu. + * vcpu_hva: The host va of the area being donated for the vcpu state. + * Must be page aligned. The size of the area must be equal to + * the page-aligned size of 'struct pkvm_hyp_vcpu'. + * Return 0 on success, negative error code on failure. + */ +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; + int ret; + + hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); + if (!hyp_vcpu) + return -ENOMEM; + + hyp_spin_lock(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } + + idx = hyp_vm->nr_vcpus; + if (idx >= hyp_vm->kvm.created_vcpus) { + ret = -EINVAL; + goto unlock; + } + + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); + if (ret) + goto unlock; + + hyp_vm->vcpus[idx] = hyp_vcpu; + hyp_vm->nr_vcpus++; +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (ret) + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + + return ret; +} + +int __pkvm_teardown_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + struct kvm *host_kvm; + size_t vm_size; + int err; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + err = -ENOENT; + goto err_unlock; + } + + if (WARN_ON(hyp_page_count(hyp_vm))) { + err = -EBUSY; + goto err_unlock; + } + + /* Ensure the VMID is clean before it can be reallocated */ + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); + + /* Reclaim guest pages (including page-table pages) */ + reclaim_guest_pages(hyp_vm); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + + /* Return the metadata pages to the host */ + host_kvm = hyp_vm->host_kvm; + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); + unmap_donated_memory(hyp_vm, vm_size); + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return 0; + +err_unlock: + hyp_spin_unlock(&vm_table_lock); + return err; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0312c9c74a5a4..2be72fbe72799 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include unsigned long hyp_nr_cpus; @@ -24,6 +25,7 @@ unsigned long hyp_nr_cpus; (unsigned long)__per_cpu_start) static void *vmemmap_base; +static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; @@ -40,6 +42,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!vmemmap_base) return -ENOMEM; + nr_pages = hyp_vm_table_pages(); + vm_table_base = hyp_early_alloc_contig(nr_pages); + if (!vm_table_base) + return -ENOMEM; + nr_pages = hyp_s1_pgtable_pages(); hyp_pgt_base = hyp_early_alloc_contig(nr_pages); if (!hyp_pgt_base) @@ -314,6 +321,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cdf8e76b0be14..a1a27f88a312d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1200,6 +1200,15 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, return 0; } +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) +{ + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; +} + static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 34229425b25d3..71493136e59c2 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -71,6 +71,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); + hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); /* -- GitLab From 9d0c063a4d1d10ef8e6288899b8524413e40cfa0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 10 Nov 2022 19:02:46 +0000 Subject: [PATCH 0581/1423] KVM: arm64: Instantiate pKVM hypervisor VM and vCPU structures from EL1 With the pKVM hypervisor at EL2 now offering hypercalls to the host for creating and destroying VM and vCPU structures, plumb these in to the existing arm64 KVM backend to ensure that the hypervisor data structures are allocated and initialised on first vCPU run for a pKVM guest. In the host, 'struct kvm_protected_vm' is introduced to hold the handle of the pKVM VM instance as well as to track references to the memory donated to the hypervisor so that it can be freed back to the host allocator following VM teardown. The stage-2 page-table, hypervisor VM and vCPU structures are allocated separately so as to avoid the need for a large physically-contiguous allocation in the host at run-time. Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-14-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 14 ++- arch/arm64/include/asm/kvm_pkvm.h | 4 + arch/arm64/kvm/arm.c | 14 +++ arch/arm64/kvm/hyp/hyp-constants.c | 3 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 15 +++- arch/arm64/kvm/pkvm.c | 138 +++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d3dd7ab9c79ea..467393e7331f1 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -117,6 +117,16 @@ struct kvm_smccc_features { typedef unsigned int pkvm_handle_t; +struct kvm_protected_vm { + pkvm_handle_t handle; + + struct { + void *pgd; + void *vm; + void *vcpus[KVM_MAX_VCPUS]; + } hyp_donations; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -170,10 +180,10 @@ struct kvm_arch { struct kvm_smccc_features smccc_feat; /* - * For an untrusted host VM, 'pkvm_handle' is used to lookup + * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. */ - pkvm_handle_t pkvm_handle; + struct kvm_protected_vm pkvm; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index f4e3133d65500..01129b0d4c689 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -14,6 +14,10 @@ #define HYP_MEMBLOCK_REGIONS 128 +int pkvm_init_host_vm(struct kvm *kvm); +int pkvm_create_hyp_vm(struct kvm *kvm); +void pkvm_destroy_hyp_vm(struct kvm *kvm); + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10c..30d6fc5d3a93d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -150,6 +151,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_free_stage2_pgd; + ret = pkvm_init_host_vm(kvm); + if (ret) + goto out_free_stage2_pgd; + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { ret = -ENOMEM; goto out_free_stage2_pgd; @@ -187,6 +192,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_vgic_destroy(kvm); + if (is_protected_kvm_enabled()) + pkvm_destroy_hyp_vm(kvm); + kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); @@ -569,6 +577,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (ret) return ret; + if (is_protected_kvm_enabled()) { + ret = pkvm_create_hyp_vm(kvm); + if (ret) + return ret; + } + if (!irqchip_in_kernel(kvm)) { /* * Tell the rest of the code that there are userspace irqchip diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c index b3742a6691e87..b257a3b4bfc5c 100644 --- a/arch/arm64/kvm/hyp/hyp-constants.c +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -2,9 +2,12 @@ #include #include +#include int main(void) { DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm)); + DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu)); return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 135c9a095eca2..2c73c4640e4dd 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -324,7 +324,7 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (idx < 0) return idx; - hyp_vm->kvm.arch.pkvm_handle = idx_to_vm_handle(idx); + hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); /* VMID 0 is reserved for the host */ atomic64_set(&mmu->vmid.id, idx + 1); @@ -333,7 +333,7 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, mmu->pgt = &hyp_vm->pgt; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm_handle; + return hyp_vm->kvm.arch.pkvm.handle; } /* @@ -458,10 +458,10 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_remove_vm_table_entry; hyp_spin_unlock(&vm_table_lock); - return hyp_vm->kvm.arch.pkvm_handle; + return hyp_vm->kvm.arch.pkvm.handle; err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm_handle); + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); err_unlock: hyp_spin_unlock(&vm_table_lock); err_remove_mappings: @@ -528,6 +528,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) { struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; + unsigned int idx; size_t vm_size; int err; @@ -553,6 +554,12 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); /* Return the metadata pages to the host */ + for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + } + host_kvm = hyp_vm->host_kvm; vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); unmap_donated_memory(hyp_vm, vm_size); diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 71493136e59c2..8c443b915e439 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -94,3 +95,140 @@ void __init kvm_hyp_reserve(void) kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); } + +/* + * Allocates and donates memory for hypervisor VM structs at EL2. + * + * Allocates space for the VM state, which includes the hyp vm as well as + * the hyp vcpus. + * + * Stores an opaque handler in the kvm struct for future reference. + * + * Return 0 on success, negative error code on failure. + */ +static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; + struct kvm_vcpu *host_vcpu; + pkvm_handle_t handle; + void *pgd, *hyp_vm; + unsigned long idx; + int ret; + + if (host_kvm->created_vcpus < 1) + return -EINVAL; + + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + /* + * The PGD pages will be reclaimed using a hyp_memcache which implies + * page granularity. So, use alloc_pages_exact() to get individual + * refcounts. + */ + pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); + if (!pgd) + return -ENOMEM; + + /* Allocate memory to donate to hyp for vm and vcpu pointers. */ + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), + host_kvm->created_vcpus))); + hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vm) { + ret = -ENOMEM; + goto free_pgd; + } + + /* Donate the VM memory to hyp and let hyp initialize it. */ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); + if (ret < 0) + goto free_vm; + + handle = ret; + + host_kvm->arch.pkvm.handle = handle; + host_kvm->arch.pkvm.hyp_donations.pgd = pgd; + host_kvm->arch.pkvm.hyp_donations.vm = hyp_vm; + + /* Donate memory for the vcpus at hyp and initialize it. */ + hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { + void *hyp_vcpu; + + /* Indexing of the vcpus to be sequential starting at 0. */ + if (WARN_ON(host_vcpu->vcpu_idx != idx)) { + ret = -EINVAL; + goto destroy_vm; + } + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) { + ret = -ENOMEM; + goto destroy_vm; + } + + host_kvm->arch.pkvm.hyp_donations.vcpus[idx] = hyp_vcpu; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, + hyp_vcpu); + if (ret) + goto destroy_vm; + } + + return 0; + +destroy_vm: + pkvm_destroy_hyp_vm(host_kvm); + return ret; +free_vm: + free_pages_exact(hyp_vm, hyp_vm_sz); +free_pgd: + free_pages_exact(pgd, pgd_sz); + return ret; +} + +int pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + int ret = 0; + + mutex_lock(&host_kvm->lock); + if (!host_kvm->arch.pkvm.handle) + ret = __pkvm_create_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + unsigned long idx, nr_vcpus = host_kvm->created_vcpus; + size_t pgd_sz, hyp_vm_sz; + + if (host_kvm->arch.pkvm.handle) + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + host_kvm->arch.pkvm.handle)); + + host_kvm->arch.pkvm.handle = 0; + + for (idx = 0; idx < nr_vcpus; ++idx) { + void *hyp_vcpu = host_kvm->arch.pkvm.hyp_donations.vcpus[idx]; + + if (!hyp_vcpu) + break; + + free_pages_exact(hyp_vcpu, PAGE_ALIGN(PKVM_HYP_VCPU_SIZE)); + } + + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), nr_vcpus))); + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + free_pages_exact(host_kvm->arch.pkvm.hyp_donations.vm, hyp_vm_sz); + free_pages_exact(host_kvm->arch.pkvm.hyp_donations.pgd, pgd_sz); +} + +int pkvm_init_host_vm(struct kvm *host_kvm) +{ + mutex_init(&host_kvm->lock); + return 0; +} -- GitLab From aa6948f82f0b7060fbbac21911dc7996b144ba3c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:47 +0000 Subject: [PATCH 0582/1423] KVM: arm64: Add per-cpu fixmap infrastructure at EL2 Mapping pages in a guest page-table from within the pKVM hypervisor at EL2 may require cache maintenance to ensure that the initialised page contents is visible even to non-cacheable (e.g. MMU-off) accesses from the guest. In preparation for performing this maintenance at EL2, introduce a per-vCPU fixmap which allows the pKVM hypervisor to map guest pages temporarily into its stage-1 page-table for the purposes of cache maintenance and, in future, poisoning on the reclaim path. The use of a fixmap avoids the need for memory allocation or locking on the map() path. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Co-developed-by: Will Deacon Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-15-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 14 +++ arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/include/nvhe/mm.h | 4 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 1 - arch/arm64/kvm/hyp/nvhe/mm.c | 104 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 4 + arch/arm64/kvm/hyp/pgtable.c | 12 -- 7 files changed, 128 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 15c389db19318..34cb93f3c96de 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -42,6 +42,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PHYS_INVALID (-1ULL) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; @@ -57,6 +59,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte) return pa; } +static inline kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } + + return pte; +} + static inline u64 kvm_granule_shift(u32 level) { /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index ce9a796a85eee..ef31a1872c93b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -59,6 +59,8 @@ enum pkvm_component_id { PKVM_ID_HYP, }; +extern unsigned long hyp_nr_cpus; + int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index b2ee6d5df55b1..d5ec972b5c1ec 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,6 +13,10 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; +int hyp_create_pcpu_fixmap(void); +void *hyp_fixmap_map(phys_addr_t phys); +void hyp_fixmap_unmap(void); + int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t back); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 459957b3082ea..8b4d3f0aa7a07 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -21,7 +21,6 @@ #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) -extern unsigned long hyp_nr_cpus; struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index d3a3b47181de0..5648ac21e62d0 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr; static u64 __io_map_base; +struct hyp_fixmap_slot { + u64 addr; + kvm_pte_t *ptep; +}; +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots); + static int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { @@ -212,6 +219,103 @@ int hyp_map_vectors(void) return 0; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); + kvm_pte_t pte, *ptep = slot->ptep; + + pte = *ptep; + pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID); + pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID; + WRITE_ONCE(*ptep, pte); + dsb(ishst); + + return (void *)slot->addr; +} + +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) +{ + kvm_pte_t *ptep = slot->ptep; + u64 addr = slot->addr; + + WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); + + /* + * Irritatingly, the architecture requires that we use inner-shareable + * broadcast TLB invalidation here in case another CPU speculates + * through our fixmap and decides to create an "amalagamation of the + * values held in the TLB" due to the apparent lack of a + * break-before-make sequence. + * + * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 + */ + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + dsb(ish); + isb(); +} + +void hyp_fixmap_unmap(void) +{ + fixmap_clear_slot(this_cpu_ptr(&fixmap_slots)); +} + +static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg); + + if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + slot->addr = addr; + slot->ptep = ptep; + + /* + * Clear the PTE, but keep the page-table page refcount elevated to + * prevent it from ever being freed. This lets us manipulate the PTEs + * by hand safely without ever needing to allocate memory. + */ + fixmap_clear_slot(slot); + + return 0; +} + +static int create_fixmap_slot(u64 addr, u64 cpu) +{ + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = (void *)cpu, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); +} + +int hyp_create_pcpu_fixmap(void) +{ + unsigned long addr, i; + int ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr); + if (ret) + return ret; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE, + __hyp_pa(__hyp_bss_start), PAGE_HYP); + if (ret) + return ret; + + ret = create_fixmap_slot(addr, i); + if (ret) + return ret; + } + + return 0; +} + int hyp_create_idmap(u32 hyp_va_bits) { unsigned long start, end; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 2be72fbe72799..0f69c1393416d 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -321,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = hyp_create_pcpu_fixmap(); + if (ret) + goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a1a27f88a312d..2bcb2d5903bac 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -57,8 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -#define KVM_PHYS_INVALID (-1ULL) - static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); @@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static kvm_pte_t kvm_phys_to_pte(u64 pa) -{ - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); - - return pte; -} - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); -- GitLab From 6c165223e9a6384aa1e934b90f2650e71adb972a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:48 +0000 Subject: [PATCH 0583/1423] KVM: arm64: Initialise hypervisor copies of host symbols unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nVHE object at EL2 maintains its own copies of some host variables so that, when pKVM is enabled, the host cannot directly modify the hypervisor state. When running in normal nVHE mode, however, these variables are still mirrored at EL2 but are not initialised. Initialise the hypervisor symbols from the host copies regardless of pKVM, ensuring that any reference to this data at EL2 with normal nVHE will return a sensibly initialised value. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-16-will@kernel.org --- arch/arm64/kvm/arm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 30d6fc5d3a93d..584626e11797f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1884,11 +1884,8 @@ static int do_pkvm_init(u32 hyp_va_bits) return ret; } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static void kvm_hyp_init_symbols(void) { - void *addr = phys_to_virt(hyp_mem_base); - int ret; - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); @@ -1897,6 +1894,12 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); +} + +static int kvm_hyp_init_protection(u32 hyp_va_bits) +{ + void *addr = phys_to_virt(hyp_mem_base); + int ret; ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) @@ -2071,6 +2074,8 @@ static int init_hyp_mode(void) cpu_prepare_hyp_mode(cpu); } + kvm_hyp_init_symbols(); + if (is_protected_kvm_enabled()) { init_cpu_logical_map(); @@ -2078,9 +2083,7 @@ static int init_hyp_mode(void) err = -ENODEV; goto out_err; } - } - if (is_protected_kvm_enabled()) { err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); -- GitLab From 13e248aab73d2f1c27b458ef09d38b44f3e5bf2e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:49 +0000 Subject: [PATCH 0584/1423] KVM: arm64: Provide I-cache invalidation by virtual address at EL2 In preparation for handling cache maintenance of guest pages from within the pKVM hypervisor at EL2, introduce an EL2 copy of icache_inval_pou() which will later be plumbed into the stage-2 page-table cache maintenance callbacks, ensuring that the initial contents of pages mapped as executable into the guest stage-2 page-table is visible to the instruction fetcher. Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-17-will@kernel.org --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/cache.S | 11 +++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index aa7fa2a08f060..fd99cf09972de 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,4 +123,5 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); +extern unsigned long kvm_nvhe_sym(__icache_flags); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8151412653de2..7f4e43bfaade7 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,9 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* Kernel symbol used by icache_is_vpipt(). */ -KVM_NVHE_ALIAS(__icache_flags); - /* VMID bits set by the KVM VMID allocator */ KVM_NVHE_ALIAS(kvm_arm_vmid_bits); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 584626e11797f..d99e93e6ddf71 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1894,6 +1894,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(__icache_flags) = __icache_flags; } static int kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 0c367eb5f4e28..85936c17ae407 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc) ret SYM_FUNC_END(__pi_dcache_clean_inval_poc) SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc) + +SYM_FUNC_START(__pi_icache_inval_pou) +alternative_if ARM64_HAS_CACHE_DIC + isb + ret +alternative_else_nop_endif + + invalidate_icache_by_line x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_icache_inval_pou) +SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 2c73c4640e4dd..0768307566d42 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -12,6 +12,9 @@ #include #include +/* Used by icache_is_vpipt(). */ +unsigned long __icache_flags; + /* * Set trap register values based on features in ID_AA64PFR0. */ -- GitLab From 717a7eebac106a5cc5d5493f8eef9cf4ae6edf19 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:50 +0000 Subject: [PATCH 0585/1423] KVM: arm64: Add generic hyp_memcache helpers The host at EL1 and the pKVM hypervisor at EL2 will soon need to exchange memory pages dynamically for creating and destroying VM state. Indeed, the hypervisor will rely on the host to donate memory pages it can use to create guest stage-2 page-tables and to store VM and vCPU metadata. In order to ease this process, introduce a 'struct hyp_memcache' which is essentially a linked list of available pages, indexed by physical addresses so that it can be passed meaningfully between the different virtual address spaces configured at EL1 and EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-18-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 57 +++++++++++++++++++ arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mm.c | 33 +++++++++++ arch/arm64/kvm/mmu.c | 26 +++++++++ 4 files changed, 118 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 467393e7331f1..835987e0f868d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,6 +73,63 @@ u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); +struct kvm_hyp_memcache { + phys_addr_t head; + unsigned long nr_pages; +}; + +static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, + phys_addr_t *p, + phys_addr_t (*to_pa)(void *virt)) +{ + *p = mc->head; + mc->head = to_pa(p); + mc->nr_pages++; +} + +static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, + void *(*to_va)(phys_addr_t phys)) +{ + phys_addr_t *p = to_va(mc->head); + + if (!mc->nr_pages) + return NULL; + + mc->head = *p; + mc->nr_pages--; + + return p; +} + +static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long min_pages, + void *(*alloc_fn)(void *arg), + phys_addr_t (*to_pa)(void *virt), + void *arg) +{ + while (mc->nr_pages < min_pages) { + phys_addr_t *p = alloc_fn(arg); + + if (!p) + return -ENOMEM; + push_hyp_memcache(mc, p, to_pa); + } + + return 0; +} + +static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, + void (*free_fn)(void *virt, void *arg), + void *(*to_va)(phys_addr_t phys), + void *arg) +{ + while (mc->nr_pages) + free_fn(pop_hyp_memcache(mc, to_va), arg); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc); +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); + struct kvm_vmid { atomic64_t id; }; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index ef31a1872c93b..420b87e755a42 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -77,6 +77,8 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); void reclaim_guest_pages(struct pkvm_hyp_vm *vm); +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc); static __always_inline void __load_host_stage2(void) { diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 5648ac21e62d0..c80b2c007619a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -340,3 +340,36 @@ int hyp_create_idmap(u32 hyp_va_bits) return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } + +static void *admit_host_page(void *arg) +{ + struct kvm_hyp_memcache *host_mc = arg; + + if (!host_mc->nr_pages) + return NULL; + + /* + * The host still owns the pages in its memcache, so we need to go + * through a full host-to-hyp donation cycle to change it. Fortunately, + * __pkvm_host_donate_hyp() takes care of races for us, so if it + * succeeds we're good to go. + */ + if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1)) + return NULL; + + return pop_hyp_memcache(host_mc, hyp_phys_to_virt); +} + +/* Refill our local memcache by poping pages from the one provided by the host. */ +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc) +{ + struct kvm_hyp_memcache tmp = *host_mc; + int ret; + + ret = __topup_hyp_memcache(mc, min_pages, admit_host_page, + hyp_virt_to_phys, &tmp); + *host_mc = tmp; + + return ret; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8c..18061163c607b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -807,6 +807,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } +static void hyp_mc_free_fn(void *addr, void *unused) +{ + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *unused) +{ + return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc) +{ + if (is_protected_kvm_enabled()) + __free_hyp_memcache(mc, hyp_mc_free_fn, + kvm_host_va, NULL); +} + +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +{ + if (!is_protected_kvm_enabled()) + return 0; + + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + kvm_host_pa, NULL); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * -- GitLab From 315775ff7c6de497dd07c3f6eff499fb538783eb Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:51 +0000 Subject: [PATCH 0586/1423] KVM: arm64: Consolidate stage-2 initialisation into a single function The initialisation of guest stage-2 page-tables is currently split across two functions: kvm_init_stage2_mmu() and kvm_arm_setup_stage2(). That is presumably for historical reasons as kvm_arm_setup_stage2() originates from the (now defunct) KVM port for 32-bit Arm. Simplify this code path by merging both functions into one, taking care to map the 'struct kvm' into the hypervisor stage-1 early on in order to simplify the failure path. Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-19-will@kernel.org --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kvm/arm.c | 27 +++++++++++++-------------- arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++++++- arch/arm64/kvm/reset.c | 29 ----------------------------- 6 files changed, 41 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8aa8492dafc0f..89e63585dae4c 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -135,7 +135,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). + * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu. * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 835987e0f868d..57218f0c449e0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -990,8 +990,6 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); - static inline bool kvm_vm_is_protected(struct kvm *kvm) { return false; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 7784081088e78..e4a7e63694998 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d99e93e6ddf71..f78eefa02f6bd 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -139,28 +139,24 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; - ret = kvm_arm_setup_stage2(kvm, type); - if (ret) - return ret; - - ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu); - if (ret) - return ret; - ret = kvm_share_hyp(kvm, kvm + 1); if (ret) - goto out_free_stage2_pgd; + return ret; ret = pkvm_init_host_vm(kvm); if (ret) - goto out_free_stage2_pgd; + goto err_unshare_kvm; if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { ret = -ENOMEM; - goto out_free_stage2_pgd; + goto err_unshare_kvm; } cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); + ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); + if (ret) + goto err_free_cpumask; + kvm_vgic_early_init(kvm); /* The maximum number of VCPUs is limited by the host's GIC model */ @@ -169,9 +165,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(&kvm->arch.mmu); + return 0; + +err_free_cpumask: + free_cpumask_var(kvm->arch.supported_cpus); +err_unshare_kvm: + kvm_unshare_hyp(kvm, kvm + 1); return ret; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 18061163c607b..3e56c6393caeb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -675,15 +675,40 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { + u32 kvm_ipa_limit = get_kvm_ipa_limit(); int cpu, err; struct kvm_pgtable *pgt; + u64 mmfr0, mmfr1; + u32 phys_shift; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ae18472205a9..e0267f672b8ab 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -395,32 +395,3 @@ int kvm_set_ipa_limit(void) return 0; } - -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) -{ - u64 mmfr0, mmfr1; - u32 phys_shift; - - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { - if (phys_shift > kvm_ipa_limit || - phys_shift < ARM64_MIN_PARANGE_BITS) - return -EINVAL; - } else { - phys_shift = KVM_PHYS_SHIFT; - if (phys_shift > kvm_ipa_limit) { - pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", - current->comm); - return -EINVAL; - } - } - - mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - - return 0; -} -- GitLab From 60dfe093ec13b056856c672e1daa35134be38283 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:52 +0000 Subject: [PATCH 0587/1423] KVM: arm64: Instantiate guest stage-2 page-tables at EL2 Extend the initialisation of guest data structures within the pKVM hypervisor at EL2 so that we instantiate a memory pool and a full 'struct kvm_s2_mmu' structure for each VM, with a stage-2 page-table entirely independent from the one managed by the host at EL1. The 'struct kvm_pgtable_mm_ops' used by the page-table code is populated with a set of callbacks that can manage guest pages in the hypervisor without any direct intervention from the host, allocating page-table pages from the provided pool and returning these to the host on VM teardown. To keep things simple, the stage-2 MMU for the guest is configured identically to the host stage-2 in the VTCR register and so the IPA size of the guest must match the PA size of the host. For now, the new page-table is unused as there is no way for the host to map anything into it. Yet. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-20-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 6 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 125 ++++++++++++++++++++++++- arch/arm64/kvm/mmu.c | 4 +- 3 files changed, 132 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 8c653a3b9501d..d14dfbcb7da16 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -9,6 +9,9 @@ #include +#include +#include + /* * Holds the relevant data for maintaining the vcpu state completely at hyp. */ @@ -30,6 +33,9 @@ struct pkvm_hyp_vm { /* The guest's stage-2 page-table managed by the hypervisor. */ struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + struct hyp_pool pool; + hyp_spinlock_t lock; /* * The number of vcpus initialized and ready to run. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8b4d3f0aa7a07..0162afba6dc48 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -25,6 +25,21 @@ struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; +static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); +#define current_vm (*this_cpu_ptr(&__current_vm)) + +static void guest_lock_component(struct pkvm_hyp_vm *vm) +{ + hyp_spin_lock(&vm->lock); + current_vm = vm; +} + +static void guest_unlock_component(struct pkvm_hyp_vm *vm) +{ + current_vm = NULL; + hyp_spin_unlock(&vm->lock); +} + static void host_lock_component(void) { hyp_spin_lock(&host_mmu.lock); @@ -140,18 +155,124 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot) +{ + return true; +} + +static void *guest_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); + + WARN_ON(size != (PAGE_SIZE << get_order(size))); + hyp_split_page(hyp_virt_to_page(addr)); + + return addr; +} + +static void guest_s2_free_pages_exact(void *addr, unsigned long size) +{ + u8 order = get_order(size); + unsigned int i; + + for (i = 0; i < (1 << order); i++) + hyp_put_page(¤t_vm->pool, addr + (i * PAGE_SIZE)); +} + +static void *guest_s2_zalloc_page(void *mc) +{ + struct hyp_page *p; + void *addr; + + addr = hyp_alloc_pages(¤t_vm->pool, 0); + if (addr) + return addr; + + addr = pop_hyp_memcache(mc, hyp_phys_to_virt); + if (!addr) + return addr; + + memset(addr, 0, PAGE_SIZE); + p = hyp_virt_to_page(addr); + memset(p, 0, sizeof(*p)); + p->refcount = 1; + + return addr; +} + +static void guest_s2_get_page(void *addr) +{ + hyp_get_page(¤t_vm->pool, addr); +} + +static void guest_s2_put_page(void *addr) +{ + hyp_put_page(¤t_vm->pool, addr); +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) { - vm->pgt.pgd = pgd; + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; + unsigned long nr_pages; + int ret; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); + if (ret) + return ret; + + hyp_spin_lock_init(&vm->lock); + vm->mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = guest_s2_zalloc_pages_exact, + .free_pages_exact = guest_s2_free_pages_exact, + .zalloc_page = guest_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = guest_s2_get_page, + .put_page = guest_s2_put_page, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, + }; + + guest_lock_component(vm); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, + guest_stage2_force_pte_cb); + guest_unlock_component(vm); + if (ret) + return ret; + + vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd); + return 0; } void reclaim_guest_pages(struct pkvm_hyp_vm *vm) { + void *pgd = vm->pgt.pgd; unsigned long nr_pages; nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; - WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages)); + + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pages)); } int __pkvm_prot_finalize(void) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3e56c6393caeb..962f4472601b1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -693,7 +693,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return -EINVAL; phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { if (phys_shift > kvm_ipa_limit || phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; -- GitLab From f41dff4efb918db68923a826e966ca62c7c8e929 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:53 +0000 Subject: [PATCH 0588/1423] KVM: arm64: Return guest memory from EL2 via dedicated teardown memcache Rather than relying on the host to free the previously-donated pKVM hypervisor VM pages explicitly on teardown, introduce a dedicated teardown memcache which allows the host to reclaim guest memory resources without having to keep track of all of the allocations made by the pKVM hypervisor at EL2. Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Will Deacon [maz: dropped __maybe_unused from unmap_donated_memory_noclear()] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-21-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 7 +---- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 17 ++++++---- arch/arm64/kvm/hyp/nvhe/pkvm.c | 27 ++++++++++++---- arch/arm64/kvm/pkvm.c | 31 ++++--------------- 5 files changed, 40 insertions(+), 44 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 57218f0c449e0..63307e7dc9c5b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -176,12 +176,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; - - struct { - void *pgd; - void *vm; - void *vcpus[KVM_MAX_VCPUS]; - } hyp_donations; + struct kvm_hyp_memcache teardown_mc; }; struct kvm_arch { diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 420b87e755a42..b7bdbe63deed8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -76,7 +76,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm); +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0162afba6dc48..94cd48f7850e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -260,19 +260,24 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm) +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { - void *pgd = vm->pgt.pgd; - unsigned long nr_pages; - - nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + void *addr; + /* Dump all pgtable pages in the hyp_pool */ guest_lock_component(vm); kvm_pgtable_stage2_destroy(&vm->pgt); vm->kvm.arch.mmu.pgd_phys = 0ULL; guest_unlock_component(vm); - WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pages)); + /* Drain the hyp_pool into the memcache */ + addr = hyp_alloc_pages(&vm->pool, 0); + while (addr) { + memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); + addr = hyp_alloc_pages(&vm->pool, 0); + } } int __pkvm_prot_finalize(void) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 0768307566d42..81835c2f4c5a0 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -393,7 +393,7 @@ static void unmap_donated_memory(void *va, size_t size) __unmap_donated_memory(va, size); } -static void __maybe_unused unmap_donated_memory_noclear(void *va, size_t size) +static void unmap_donated_memory_noclear(void *va, size_t size) { if (!va) return; @@ -527,8 +527,21 @@ unlock: return ret; } +static void +teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) +{ + size = PAGE_ALIGN(size); + memset(addr, 0, size); + + for (void *start = addr; start < addr + size; start += PAGE_SIZE) + push_hyp_memcache(mc, start, hyp_virt_to_phys); + + unmap_donated_memory_noclear(addr, size); +} + int __pkvm_teardown_vm(pkvm_handle_t handle) { + struct kvm_hyp_memcache *mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -547,25 +560,27 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) goto err_unlock; } + host_kvm = hyp_vm->host_kvm; + /* Ensure the VMID is clean before it can be reallocated */ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); remove_vm_table_entry(handle); hyp_spin_unlock(&vm_table_lock); /* Reclaim guest pages (including page-table pages) */ - reclaim_guest_pages(hyp_vm); + mc = &host_kvm->arch.pkvm.teardown_mc; + reclaim_guest_pages(hyp_vm, mc); unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); - /* Return the metadata pages to the host */ + /* Push the metadata pages to the teardown memcache */ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; - unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } - host_kvm = hyp_vm->host_kvm; vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); - unmap_donated_memory(hyp_vm, vm_size); + teardown_donated_memory(mc, hyp_vm, vm_size); hyp_unpin_shared_mem(host_kvm, host_kvm + 1); return 0; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8c443b915e439..cf56958b1492a 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -147,8 +147,6 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) handle = ret; host_kvm->arch.pkvm.handle = handle; - host_kvm->arch.pkvm.hyp_donations.pgd = pgd; - host_kvm->arch.pkvm.hyp_donations.vm = hyp_vm; /* Donate memory for the vcpus at hyp and initialize it. */ hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); @@ -167,12 +165,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) goto destroy_vm; } - host_kvm->arch.pkvm.hyp_donations.vcpus[idx] = hyp_vcpu; - ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, hyp_vcpu); - if (ret) + if (ret) { + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); goto destroy_vm; + } } return 0; @@ -201,30 +199,13 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { - unsigned long idx, nr_vcpus = host_kvm->created_vcpus; - size_t pgd_sz, hyp_vm_sz; - - if (host_kvm->arch.pkvm.handle) + if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, host_kvm->arch.pkvm.handle)); - - host_kvm->arch.pkvm.handle = 0; - - for (idx = 0; idx < nr_vcpus; ++idx) { - void *hyp_vcpu = host_kvm->arch.pkvm.hyp_donations.vcpus[idx]; - - if (!hyp_vcpu) - break; - - free_pages_exact(hyp_vcpu, PAGE_ALIGN(PKVM_HYP_VCPU_SIZE)); } - hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, - size_mul(sizeof(void *), nr_vcpus))); - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); - - free_pages_exact(host_kvm->arch.pkvm.hyp_donations.vm, hyp_vm_sz); - free_pages_exact(host_kvm->arch.pkvm.hyp_donations.pgd, pgd_sz); + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); } int pkvm_init_host_vm(struct kvm *host_kvm) -- GitLab From fe41a7f8c0ee3ee2f682f8c28c7e1c5ff2be8a79 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:54 +0000 Subject: [PATCH 0589/1423] KVM: arm64: Unmap 'kvm_arm_hyp_percpu_base' from the host When pKVM is enabled, the hypervisor at EL2 does not trust the host at EL1 and must therefore prevent it from having unrestricted access to internal hypervisor state. The 'kvm_arm_hyp_percpu_base' array holds the offsets for hypervisor per-cpu allocations, so move this this into the nVHE code where it cannot be modified by the untrusted host at EL1. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-22-will@kernel.org --- arch/arm64/include/asm/kvm_asm.h | 4 ++-- arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 9 ++++----- arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 2 ++ 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index de52ba775d48f..43c3bc0f9544d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -109,7 +109,7 @@ enum __kvm_host_smccc_func { #define per_cpu_ptr_nvhe_sym(sym, cpu) \ ({ \ unsigned long base, off; \ - base = kvm_arm_hyp_percpu_base[cpu]; \ + base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ @@ -214,7 +214,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[]; DECLARE_KVM_NVHE_SYM(__per_cpu_start); DECLARE_KVM_NVHE_SYM(__per_cpu_end); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7f4e43bfaade7..ae8f37f4aa8c6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -89,9 +89,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* Array containing bases of nVHE per-CPU memory regions. */ -KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); - /* PMU available static key */ #ifdef CONFIG_HW_PERF_EVENTS KVM_NVHE_ALIAS(kvm_arm_pmu_available); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f78eefa02f6bd..25467f24803dd 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,7 +51,6 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); static bool vgic_present; @@ -1857,13 +1856,13 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); } } static int do_pkvm_init(u32 hyp_va_bits) { - void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); + void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; preempt_disable(); @@ -1967,7 +1966,7 @@ static int init_hyp_mode(void) page_addr = page_address(page); memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); - kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } /* @@ -2060,7 +2059,7 @@ static int init_hyp_mode(void) } for_each_possible_cpu(cpu) { - char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); /* Map Hyp percpu pages */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 9f54833af4009..04d194583f1e4 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -23,6 +23,8 @@ u64 cpu_logical_map(unsigned int cpu) return hyp_cpu_logical_map[cpu]; } +unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS]; + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; -- GitLab From 73f38ef2ae531b180685173e0923225551434fcb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:55 +0000 Subject: [PATCH 0590/1423] KVM: arm64: Maintain a copy of 'kvm_arm_vmid_bits' at EL2 Sharing 'kvm_arm_vmid_bits' between EL1 and EL2 allows the host to modify the variable arbitrarily, potentially leading to all sorts of shenanians as this is used to configure the VTTBR register for the guest stage-2. In preparation for unmapping host sections entirely from EL2, maintain a copy of 'kvm_arm_vmid_bits' in the pKVM hypervisor and initialise it from the host value while it is still trusted. Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-23-will@kernel.org --- arch/arm64/include/asm/kvm_hyp.h | 2 ++ arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fd99cf09972de..6797eafe7890b 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -124,4 +124,6 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); +extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ae8f37f4aa8c6..31ad75da4d589 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,9 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* VMID bits set by the KVM VMID allocator */ -KVM_NVHE_ALIAS(kvm_arm_vmid_bits); - /* Static keys which are set if a vGIC trap should be handled in hyp. */ KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 25467f24803dd..1d4b8122d0108 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1893,6 +1893,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; + kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; } static int kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 81835c2f4c5a0..ed6ceac1e8547 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -15,6 +15,9 @@ /* Used by icache_is_vpipt(). */ unsigned long __icache_flags; +/* Used by kvm_get_vttbr(). */ +unsigned int kvm_arm_vmid_bits; + /* * Set trap register values based on features in ID_AA64PFR0. */ -- GitLab From 27eb26bfff5d358d42911d04bbecc62e659ec32b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:56 +0000 Subject: [PATCH 0591/1423] KVM: arm64: Explicitly map 'kvm_vgic_global_state' at EL2 The pkvm hypervisor at EL2 may need to read the 'kvm_vgic_global_state' variable from the host, for example when saving and restoring the state of the virtual GIC. Explicitly map 'kvm_vgic_global_state' in the stage-1 page-table of the pKVM hypervisor rather than relying on mapping all of the host '.rodata' section. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-24-will@kernel.org --- arch/arm64/kvm/hyp/nvhe/setup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0f69c1393416d..5a371ab236db7 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -161,6 +161,11 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(&kvm_vgic_global_state, + &kvm_vgic_global_state + 1, prot); + if (ret) + return ret; + return 0; } -- GitLab From 169cd0f8238f2598b85d2db2e15828e8f8da18e5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 10 Nov 2022 19:02:57 +0000 Subject: [PATCH 0592/1423] KVM: arm64: Don't unnecessarily map host kernel sections at EL2 We no longer need to map the host's '.rodata' and '.bss' sections in the stage-1 page-table of the pKVM hypervisor at EL2, so remove those mappings and avoid creating any future dependencies at EL2 on host-controlled data structures. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-25-will@kernel.org --- arch/arm64/kernel/image-vars.h | 6 ------ arch/arm64/kvm/hyp/nvhe/setup.c | 14 +++----------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 31ad75da4d589..e3f88b5836a20 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -102,12 +102,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); #endif -/* Kernel memory sections */ -KVM_NVHE_ALIAS(__start_rodata); -KVM_NVHE_ALIAS(__end_rodata); -KVM_NVHE_ALIAS(__bss_start); -KVM_NVHE_ALIAS(__bss_stop); - /* Hyp memory sections */ KVM_NVHE_ALIAS(__hyp_idmap_text_start); KVM_NVHE_ALIAS(__hyp_idmap_text_end); diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5a371ab236db7..5cdf3fb09bb42 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -144,23 +144,15 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, } /* - * Map the host's .bss and .rodata sections RO in the hypervisor, but - * transfer the ownership from the host to the hypervisor itself to - * make sure it can't be donated or shared with another entity. + * Map the host sections RO in the hypervisor, but transfer the + * ownership from the host to the hypervisor itself to make sure they + * can't be donated or shared with another entity. * * The ownership transition requires matching changes in the host * stage-2. This will be done later (see finalize_host_mappings()) once * the hyp_vmemmap is addressable. */ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); - if (ret) - return ret; - - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); - if (ret) - return ret; - ret = pkvm_create_mappings(&kvm_vgic_global_state, &kvm_vgic_global_state + 1, prot); if (ret) -- GitLab From be66e67f175096f283c9d5614c4991fc9e7ed975 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Nov 2022 19:02:59 +0000 Subject: [PATCH 0593/1423] KVM: arm64: Use the pKVM hyp vCPU structure in handle___kvm_vcpu_run() As a stepping stone towards deprivileging the host's access to the guest's vCPU structures, introduce some naive flush/sync routines to copy most of the host vCPU into the hyp vCPU on vCPU run and back again on return to EL1. This allows us to run using the pKVM hyp structures when KVM is initialised in protected mode. Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110190259.26861-27-will@kernel.org --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 4 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 79 +++++++++++++++++++++++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 28 +++++++++ 3 files changed, 109 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index d14dfbcb7da16..82b3d62538a61 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -61,4 +61,8 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva); int __pkvm_teardown_vm(pkvm_handle_t handle); +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx); +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index b5f3fcfe91357..728e01d4536b0 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -22,11 +22,86 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + + hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + + hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; + + hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; + + hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; + hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; + + hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); + hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; + + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; +} + +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + unsigned int i; + + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; + + host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; + host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; + + host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; + + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; + + host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + for (i = 0; i < hyp_cpu_if->used_lrs; ++i) + host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); + int ret; + + host_vcpu = kern_hyp_va(host_vcpu); + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm *host_kvm; + + host_kvm = kern_hyp_va(host_vcpu->kvm); + hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, + host_vcpu->vcpu_idx); + if (!hyp_vcpu) { + ret = -EINVAL; + goto out; + } + + flush_hyp_vcpu(hyp_vcpu); + + ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); + + sync_hyp_vcpu(hyp_vcpu); + pkvm_put_hyp_vcpu(hyp_vcpu); + } else { + /* The host is fully trusted, run its vCPU directly. */ + ret = __kvm_vcpu_run(host_vcpu); + } - cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); +out: + cpu_reg(host_ctxt, 1) = ret; } static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index ed6ceac1e8547..a06ece14a6d8e 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -241,6 +241,33 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) return vm_table[idx]; } +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx) +{ + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + goto unlock; + + hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); +unlock: + hyp_spin_unlock(&vm_table_lock); + return hyp_vcpu; +} + +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + hyp_spin_lock(&vm_table_lock); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -286,6 +313,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; + hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); done: if (ret) unpin_host_vcpu(host_vcpu); -- GitLab From e6ecb142429183cef4835f31d4134050ae660032 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 8 Nov 2022 17:59:34 -0800 Subject: [PATCH 0594/1423] f2fs: allow to read node block after shutdown If block address is still alive, we should give a valid node block even after shutdown. Otherwise, we can see zero data when reading out a file. Cc: stable@vger.kernel.org Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 983572f238969..b9ee5a1176a07 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1360,8 +1360,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) return err; /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ - if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { ClearPageUptodate(page); return -ENOENT; } -- GitLab From 225d6795abf47c3340214ca1b4c22728e463db4f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 21:26:38 +0800 Subject: [PATCH 0595/1423] f2fs: add proc entry to show discard_plist info This patch adds a new proc entry to show discard_plist information in more detail, which is very helpful to know the discard pend list count clearly. Such as: Discard pend list(Show diacrd_cmd count on each entry, .:not exist): 0 390 156 85 67 46 37 26 14 8 17 12 9 9 6 12 11 10 16 5 9 2 4 8 3 4 1 24 3 2 2 5 2 4 5 4 32 3 3 2 3 . 3 3 1 40 . 4 1 3 2 1 2 1 48 1 . 1 1 . 1 1 . 56 . 1 1 1 . 2 . 1 64 1 2 . . . . . . 72 . 1 . . . . . . 80 3 1 . . 1 1 . . 88 1 . . . 1 . . 1 ...... Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 032c03e09580d..97bf0dbb0974b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1252,6 +1252,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1322,6 +1360,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1345,6 +1385,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } -- GitLab From 4d8d45df2252980f800c1b2fde941a103a18a70e Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Oct 2022 12:24:15 -0700 Subject: [PATCH 0596/1423] f2fs: correct i_size change for atomic writes We need to make sure i_size doesn't change until atomic write commit is successful and restore it when commit is failed. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/inode.c | 5 ++++- fs/f2fs/segment.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ef4cce3d7f4..11c475beca2c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -768,6 +768,7 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_MAX, /* max flag, never be used */ }; @@ -826,6 +827,7 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3075,6 +3077,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3084,6 +3088,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c605a4f2bce21..28f586e779992 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2041,6 +2041,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2099,7 +2100,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); stat_inc_atomic_inode(inode); @@ -2137,16 +2143,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9f0d3864d9f13..577f109b4e1d9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,9 +621,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); set_raw_extent(&et->largest, &ri->i_ext); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aa4be7f25963d..8aa81238c770f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,14 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -335,10 +339,12 @@ next: } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? true : false); -- GitLab From cc249e4cba9a6002c9d9e1438daf8440a160bc9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Nov 2022 21:25:44 +0800 Subject: [PATCH 0597/1423] f2fs: fix to avoid accessing uninitialized spinlock syzbot reports a kernel bug: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 assign_lock_key+0x22a/0x240 kernel/locking/lockdep.c:981 register_lock_class+0x287/0x9b0 kernel/locking/lockdep.c:1294 __lock_acquire+0xe4/0x1f60 kernel/locking/lockdep.c:4934 lock_acquire+0x1a7/0x400 kernel/locking/lockdep.c:5668 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] f2fs_save_errors fs/f2fs/super.c:3868 [inline] f2fs_handle_error+0x29/0x230 fs/f2fs/super.c:3896 f2fs_iget+0x215/0x4bb0 fs/f2fs/inode.c:516 f2fs_fill_super+0x47d3/0x7b50 fs/f2fs/super.c:4222 mount_bdev+0x26c/0x3a0 fs/super.c:1401 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1531 do_new_mount+0x289/0xad0 fs/namespace.c:3040 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount+0x2e3/0x3d0 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd F2FS-fs (loop1): Failed to read F2FS meta data inode The root cause is if sbi->error_lock may be accessed before its initialization, fix it. Link: https://lore.kernel.org/linux-f2fs-devel/0000000000007edb6605ecbb6442@google.com/T/#u Reported-by: syzbot+40642be9b7e0bb28e0df@syzkaller.appspotmail.com Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a43d8a46a6e56..68a6c2eedcaca 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4196,6 +4196,9 @@ try_onemore: if (err) goto free_bio_info; + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); @@ -4263,9 +4266,6 @@ try_onemore: goto free_devices; } - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, -- GitLab From 59237a21776f70ffb0420611c23e7158e1317037 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Nov 2022 22:33:21 +0800 Subject: [PATCH 0598/1423] f2fs: optimize iteration over sparse directories Wei Chen reports a kernel bug as blew: INFO: task syz-executor.0:29056 blocked for more than 143 seconds. Not tainted 5.15.0-rc5 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.0 state:D stack:14632 pid:29056 ppid: 6574 flags:0x00000004 Call Trace: __schedule+0x4a1/0x1720 schedule+0x36/0xe0 rwsem_down_write_slowpath+0x322/0x7a0 fscrypt_ioctl_set_policy+0x11f/0x2a0 __f2fs_ioctl+0x1a9f/0x5780 f2fs_ioctl+0x89/0x3a0 __x64_sys_ioctl+0xe8/0x140 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Eric did some investigation on this issue, quoted from reply of Eric: "Well, the quality of this bug report has a lot to be desired (not on upstream kernel, reproducer is full of totally irrelevant stuff, not sent to the mailing list of the filesystem whose disk image is being fuzzed, etc.). But what is going on is that f2fs_empty_dir() doesn't consider the case of a directory with an extremely large i_size on a malicious disk image. Specifically, the reproducer mounts an f2fs image with a directory that has an i_size of 14814520042850357248, then calls FS_IOC_SET_ENCRYPTION_POLICY on it. That results in a call to f2fs_empty_dir() to check whether the directory is empty. f2fs_empty_dir() then iterates through all 3616826182336513 blocks the directory allegedly contains to check whether any contain anything. i_rwsem is held during this, so anything else that tries to take it will hang." In order to solve this issue, let's use f2fs_get_next_page_offset() to speed up iteration by skipping holes for all below functions: - f2fs_empty_dir - f2fs_readdir - find_in_level The way why we can speed up iteration was described in 'commit 3cf4574705b4 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA")'. Meanwhile, in f2fs_empty_dir(), let's use f2fs_find_data_page() instead f2fs_get_lock_data_page(), due to i_rwsem was held in caller of f2fs_empty_dir(), there shouldn't be any races, so it's fine to not lock dentry page during lookuping dirents in the page. Link: https://lore.kernel.org/lkml/536944df-a0ae-1dd8-148f-510b476e1347@kernel.org/T/ Reported-by: Wei Chen Cc: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- fs/f2fs/dir.c | 34 ++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b4..9b47ded653d1c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1206,7 +1206,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write) + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1232,12 +1233,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1281,7 +1287,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1291,7 +1298,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1317,7 +1324,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 21960a899b6ad..030b7fd4142ff 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11c475beca2c1..6a8cbf5bb1871 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3807,8 +3807,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6466db75af5d2..f1a46519a5fe3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1562,8 +1562,8 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); -- GitLab From 92b4cf5b48955a4bdd15fe4e2067db8ebd87f04c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Nov 2022 07:04:42 +0900 Subject: [PATCH 0599/1423] f2fs: initialize locks earlier in f2fs_fill_super() syzbot is reporting lockdep warning at f2fs_handle_error() [1], for spin_lock(&sbi->error_lock) is called before spin_lock_init() is called. For safe locking in error handling, move initialization of locks (and obvious structures) in f2fs_fill_super() to immediately after memory allocation. Link: https://syzkaller.appspot.com/bug?extid=40642be9b7e0bb28e0df [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68a6c2eedcaca..8f4fc3ad67650 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4103,6 +4103,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4126,6 +4144,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4182,26 +4202,14 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4279,12 +4287,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); -- GitLab From 967eaad1fed5f6335ea97a47d45214744dc57925 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 10 Nov 2022 17:15:01 +0800 Subject: [PATCH 0600/1423] f2fs: fix to set flush_merge opt and show noflush_merge Some minor modifications to flush_merge and related parameters: 1.The FLUSH_MERGE opt is set by default only in non-ro mode. 2.When ro and merge are set at the same time, an error is reported. 3.Display noflush_merge mount opt. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f4fc3ad67650..75027ff85cd93 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1353,6 +1353,12 @@ default_check: return -EINVAL; } + if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && + test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1941,8 +1947,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else @@ -2073,7 +2081,8 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { -- GitLab From 98b04dd0b4577894520493d96bc4623387767445 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 26 Oct 2022 02:11:21 -0400 Subject: [PATCH 0601/1423] PCI: Fix pci_device_is_present() for VFs by checking PF pci_device_is_present() previously didn't work for VFs because it reads the Vendor and Device ID, which are 0xffff for VFs, which looks like they aren't present. Check the PF instead. Wei Gong reported that if virtio I/O is in progress when the driver is unbound or "0" is written to /sys/.../sriov_numvfs, the virtio I/O operation hangs, which may result in output like this: task:bash state:D stack: 0 pid: 1773 ppid: 1241 flags:0x00004002 Call Trace: schedule+0x4f/0xc0 blk_mq_freeze_queue_wait+0x69/0xa0 blk_mq_freeze_queue+0x1b/0x20 blk_cleanup_queue+0x3d/0xd0 virtblk_remove+0x3c/0xb0 [virtio_blk] virtio_dev_remove+0x4b/0x80 ... device_unregister+0x1b/0x60 unregister_virtio_device+0x18/0x30 virtio_pci_remove+0x41/0x80 pci_device_remove+0x3e/0xb0 This happened because pci_device_is_present(VF) returned "false" in virtio_pci_remove(), so it called virtio_break_device(). The broken vq meant that vring_interrupt() skipped the vq.callback() that would have completed the virtio I/O operation via virtblk_done(). [bhelgaas: commit log, simplify to always use pci_physfn(), add stable tag] Link: https://lore.kernel.org/r/20221026060912.173250-1-mst@redhat.com Reported-by: Wei Gong Tested-by: Wei Gong Signed-off-by: Michael S. Tsirkin Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- drivers/pci/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 9f3cc829dfeea..fba95486caaf2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6447,6 +6447,8 @@ bool pci_device_is_present(struct pci_dev *pdev) { u32 v; + /* Check PF if pdev is a VF, since VF Vendor/Device IDs are 0xffff */ + pdev = pci_physfn(pdev); if (pci_dev_is_disconnected(pdev)) return false; return pci_bus_read_dev_vendor_id(pdev->bus, pdev->devfn, &v, 0); -- GitLab From c42edde5de3af6285fbb38c9d503a40ef491d10d Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:37 +0200 Subject: [PATCH 0602/1423] i2c: designware: Fix slave state machine for sequential reads Some read types from I2C bus don't work correctly when testing the i2c-designware-slave.c with the slave-eeprom backend. The same reads work correctly when testing with a real 24c02 EEPROM chip. In the following tests an i2c-designware-slave.c instance with the slave-eeprom backend is configured to act as a simulated 24c02 at address 0x65 on an I2C host bus 6: 1. i2cdump -y 6 0x65 b (OK) Random read. Each byte are read using a byte address write with a current address read in a same message. 2. i2cdump -y 6 0x65 c (OK, was NOK before commit 3b5f7f10ff6e when it was repeating the 1st byte) Repeated current address read. One byte address write message followed by repeated current address read messages. 3. i2cdump -y 6 0x65 i (NOK, each 32 byte block repeats the 1st byte of block) Sequential read using SMBus Block Read. For each 32 byte block a byte address write followed by 32 sequental reads in a same message. These findings are explained because the implementation has had a mismatch between hardware interrupts and what I2C slave events should be sent after those interrupts. Despite that the case 1 happened to have always the I2C slave events sent to a right order with a right data between backend and the I2C bus. Hardware generates the DW_IC_INTR_RD_REQ interrupt when another host is attempting to read and for sequential reads after. DW_IC_INTR_RX_DONE occurs when host does not acknowledge a transmitted byte which is an indication the end of transmission. Those interrupts do not match directly with I2C_SLAVE_READ_REQUESTED and I2C_SLAVE_READ_PROCESSED events which is how the code was and is practically using them. The slave-eeprom backend increases the buffer index with the I2C_SLAVE_READ_PROCESSED event and returns the data from current index when receiving only the I2C_SLAVE_READ_REQUESTED event. That explains the repeated bytes in case 3 and also case 2 before commit 3b5f7f10ff6e ("i2c: designware: slave should do WRITE_REQUESTED before WRITE_RECEIVED"). Patch fixes the case 3 while keep cases 1 and 2 working with following changes: - First DW_IC_INTR_RD_REQ interrupt will change the state machine to read in progress state, send I2C_SLAVE_READ_REQUESTED event and transmit the first byte from backend - Subsequent DW_IC_INTR_RD_REQ interrupts will send I2C_SLAVE_READ_PROCESSED events and transmit next bytes from backend - STOP won't change the state machine. Otherwise case 2 won't work since we cannot distinguish current address read from sequentiel read - DW_IC_INTR_RX_DONE interrupt is needless since there is no mechanism to inform it to a backend. It cannot be used to change state machine at the end of read either due the same reason than above - Next host write to us will change the state machine from read to write in progress state - STATUS_WRITE_IN_PROGRESS and STATUS_READ_IN_PROGRESS are considered now to be status flags not the state of the driver. This is how we treat them in i2c-designware-master.c While at it do not test the return code from i2c_slave_event() for I2C_SLAVE_READ_REQUESTED and I2C_SLAVE_READ_PROCESSED since it returns always 0. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-core.h | 1 - drivers/i2c/busses/i2c-designware-slave.c | 32 +++++++++++------------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 4d3a3b464ecd8..dbf6bdc5f01b3 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -103,7 +103,6 @@ #define DW_IC_INTR_MASTER_MASK (DW_IC_INTR_DEFAULT_MASK | \ DW_IC_INTR_TX_EMPTY) #define DW_IC_INTR_SLAVE_MASK (DW_IC_INTR_DEFAULT_MASK | \ - DW_IC_INTR_RX_DONE | \ DW_IC_INTR_RX_UNDER | \ DW_IC_INTR_RD_REQ) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 0d15f4c1e9f7e..1eac4f4d5573b 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -173,8 +173,9 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) enabled, slave_activity, raw_stat, stat); if (stat & DW_IC_INTR_RX_FULL) { - if (dev->status != STATUS_WRITE_IN_PROGRESS) { - dev->status = STATUS_WRITE_IN_PROGRESS; + if (!(dev->status & STATUS_WRITE_IN_PROGRESS)) { + dev->status |= STATUS_WRITE_IN_PROGRESS; + dev->status &= ~STATUS_READ_IN_PROGRESS; i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_REQUESTED, &val); } @@ -190,24 +191,23 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) if (slave_activity) { regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); - dev->status = STATUS_READ_IN_PROGRESS; - if (!i2c_slave_event(dev->slave, - I2C_SLAVE_READ_REQUESTED, - &val)) - regmap_write(dev->map, DW_IC_DATA_CMD, val); + if (!(dev->status & STATUS_READ_IN_PROGRESS)) { + i2c_slave_event(dev->slave, + I2C_SLAVE_READ_REQUESTED, + &val); + dev->status |= STATUS_READ_IN_PROGRESS; + dev->status &= ~STATUS_WRITE_IN_PROGRESS; + } else { + i2c_slave_event(dev->slave, + I2C_SLAVE_READ_PROCESSED, + &val); + } + regmap_write(dev->map, DW_IC_DATA_CMD, val); } } - if (stat & DW_IC_INTR_RX_DONE) { - if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_PROCESSED, - &val)) - regmap_read(dev->map, DW_IC_CLR_RX_DONE, &tmp); - } - - if (stat & DW_IC_INTR_STOP_DET) { - dev->status = STATUS_IDLE; + if (stat & DW_IC_INTR_STOP_DET) i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); - } return 1; } -- GitLab From dcf1bf648f94d651a3e6fb433e9ba0a3e43c1047 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:38 +0200 Subject: [PATCH 0603/1423] i2c: designware: Empty receive FIFO in slave interrupt handler Writes from I2C bus often fail when testing the i2c-designware-slave.c with the slave-eeprom backend. The same writes work correctly when testing with a real 24c02 EEPROM chip. In the tests below an i2c-designware-slave.c instance with the slave-eeprom backend is configured to act as a simulated 24c02 at address 0x65 on an I2C host bus 6. 1. i2cset -y 6 0x65 0x00 0x55 Single byte 0x55 write into address 0x00. No data goes into simulated EEPROM. Debug prints from the i2c_dw_irq_handler_slave(): 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x714 : INTR_STAT=0x204 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 2. i2ctransfer -y 6 w9@0x65 0x00 0xff- Write 8 bytes with decrementing value starting from 0xff at address 0x00 and forward. Only some of the data goes into arbitrary addresses. Content is something like below but varies: 00000000 f9 f8 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| 00000050 00 00 00 00 00 00 ff fe 00 00 00 00 00 00 00 00 |................| 000000f0 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fb fa |................| In this case debug prints were: 0x1 STATUS SLAVE_ACTIVITY=0x1 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x1 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x714 : INTR_STAT=0x204 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x514 : INTR_STAT=0x4 0x1 STATUS SLAVE_ACTIVITY=0x0 : RAW_INTR_STAT=0x510 : INTR_STAT=0x0 Both cases show there is more data coming from the receive FIFO still after detecting the STOP condition. This can be seen from interrupt status bits DW_IC_INTR_STOP_DET (0x200) and DW_IC_INTR_RX_FULL (0x4). Perhaps due interrupt latencies the receive FIFO is not read fast enough, STOP detection happens synchronously when it occurs on the I2C bus and the DW_IC_INTR_RX_FULL keeps coming as long as there are more bytes in the receive FIFO. Fix this by reading the receive FIFO completely empty whenever DW_IC_INTR_RX_FULL occurs. Use RFNE, Receive FIFO Not Empty bit in the DW_IC_STATUS register to loop through bytes in the FIFO. While at it do not test the return code from i2c_slave_event() for the I2C_SLAVE_WRITE_RECEIVED since to my understanding this hardware cannot generate NACK to incoming bytes and debug print itself does not have much value. Reported-by: Tian Ye Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-core.h | 1 + drivers/i2c/busses/i2c-designware-slave.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index dbf6bdc5f01b3..6d1df28dd93b1 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -108,6 +108,7 @@ #define DW_IC_STATUS_ACTIVITY BIT(0) #define DW_IC_STATUS_TFE BIT(2) +#define DW_IC_STATUS_RFNE BIT(3) #define DW_IC_STATUS_MASTER_ACTIVITY BIT(5) #define DW_IC_STATUS_SLAVE_ACTIVITY BIT(6) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 1eac4f4d5573b..295774a69b67a 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -180,11 +180,13 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) &val); } - regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); - val = tmp; - if (!i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, - &val)) - dev_vdbg(dev->dev, "Byte %X acked!", val); + do { + regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); + val = tmp; + i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, + &val); + regmap_read(dev->map, DW_IC_STATUS, &tmp); + } while (tmp & DW_IC_STATUS_RFNE); } if (stat & DW_IC_INTR_RD_REQ) { -- GitLab From 4d827824b7bb29ce944172f6297db6d7a1ec37b3 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:39 +0200 Subject: [PATCH 0604/1423] i2c: designware: Define software status flags with BIT() Define software status flags with a BIT() macro. While at it remove STATUS_IDLE and replace its use with zero initialization and status flags clearing with a mask. Suggested-by: Andy Shevchenko Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-core.h | 10 +++++----- drivers/i2c/busses/i2c-designware-master.c | 4 ++-- drivers/i2c/busses/i2c-designware-slave.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 6d1df28dd93b1..457e6966f85e0 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -123,12 +123,12 @@ #define DW_IC_COMP_PARAM_1_SPEED_MODE_MASK GENMASK(3, 2) /* - * status codes + * Sofware status flags */ -#define STATUS_IDLE 0x0 -#define STATUS_ACTIVE 0x1 -#define STATUS_WRITE_IN_PROGRESS 0x2 -#define STATUS_READ_IN_PROGRESS 0x4 +#define STATUS_ACTIVE BIT(0) +#define STATUS_WRITE_IN_PROGRESS BIT(1) +#define STATUS_READ_IN_PROGRESS BIT(2) +#define STATUS_MASK GENMASK(2, 0) /* * operation modes diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index dc3c5a15a95b9..1b7db2b58f311 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -574,7 +574,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num) dev->msg_write_idx = 0; dev->msg_read_idx = 0; dev->msg_err = 0; - dev->status = STATUS_IDLE; + dev->status = 0; dev->abort_source = 0; dev->rx_outstanding = 0; @@ -731,7 +731,7 @@ static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) if (stat & DW_IC_INTR_TX_ABRT) { dev->cmd_err |= DW_IC_ERR_TX_ABRT; - dev->status = STATUS_IDLE; + dev->status &= ~STATUS_MASK; dev->rx_outstanding = 0; /* diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 295774a69b67a..84eb0bec70fa0 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -82,7 +82,7 @@ static int i2c_dw_reg_slave(struct i2c_client *slave) dev->msg_write_idx = 0; dev->msg_read_idx = 0; dev->msg_err = 0; - dev->status = STATUS_IDLE; + dev->status = 0; dev->abort_source = 0; dev->rx_outstanding = 0; -- GitLab From 40015c67533548986eaba42a3d9ba9d004bee41e Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:40 +0200 Subject: [PATCH 0605/1423] i2c: designware: Remove needless initializations from i2c_dw_reg_slave() These struct dw_i2c_dev members are not used in i2c-designware-slave.c so remove re-initialization of them from i2c_dw_reg_slave(). Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-slave.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 84eb0bec70fa0..421a604bf68f2 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -78,13 +78,7 @@ static int i2c_dw_reg_slave(struct i2c_client *slave) __i2c_dw_enable(dev); - dev->cmd_err = 0; - dev->msg_write_idx = 0; - dev->msg_read_idx = 0; - dev->msg_err = 0; dev->status = 0; - dev->abort_source = 0; - dev->rx_outstanding = 0; return 0; } -- GitLab From 5cd69850308f7da3c2bfc117fca5fbfa3c530ade Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:41 +0200 Subject: [PATCH 0606/1423] i2c: designware: Remove unused completion code from i2c-designware-slave Remove unused completion code from i2c-designware-slave.c. Used only in i2c-designware-master.c. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-slave.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 421a604bf68f2..12f0417aa0ae3 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -214,8 +214,6 @@ static irqreturn_t i2c_dw_isr_slave(int this_irq, void *dev_id) int ret; ret = i2c_dw_irq_handler_slave(dev); - if (ret > 0) - complete(&dev->cmd_complete); return IRQ_RETVAL(ret); } @@ -242,8 +240,6 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev) struct i2c_adapter *adap = &dev->adapter; int ret; - init_completion(&dev->cmd_complete); - dev->init = i2c_dw_init_slave; dev->disable = i2c_dw_disable; dev->disable_int = i2c_dw_disable_int; -- GitLab From 4c7107c2974270ff369342f6f1bd5d3669c182f1 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:42 +0200 Subject: [PATCH 0607/1423] i2c: designware: Simplify slave interrupt handler nesting Interrupt processing code in i2c-designware-slave.c is bit more readable if not divided into another subroutine. Also explicit IRQ_NONE and IRQ_HANDLED return values are more obvious. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-slave.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 12f0417aa0ae3..3c855cd45c34c 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -147,9 +147,9 @@ static u32 i2c_dw_read_clear_intrbits_slave(struct dw_i2c_dev *dev) * Interrupt service routine. This gets called whenever an I2C slave interrupt * occurs. */ - -static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) +static irqreturn_t i2c_dw_isr_slave(int this_irq, void *dev_id) { + struct dw_i2c_dev *dev = dev_id; u32 raw_stat, stat, enabled, tmp; u8 val = 0, slave_activity; @@ -159,7 +159,7 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) slave_activity = ((tmp & DW_IC_STATUS_SLAVE_ACTIVITY) >> 6); if (!enabled || !(raw_stat & ~DW_IC_INTR_ACTIVITY) || !dev->slave) - return 0; + return IRQ_NONE; stat = i2c_dw_read_clear_intrbits_slave(dev); dev_dbg(dev->dev, @@ -205,17 +205,7 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) if (stat & DW_IC_INTR_STOP_DET) i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); - return 1; -} - -static irqreturn_t i2c_dw_isr_slave(int this_irq, void *dev_id) -{ - struct dw_i2c_dev *dev = dev_id; - int ret; - - ret = i2c_dw_irq_handler_slave(dev); - - return IRQ_RETVAL(ret); + return IRQ_HANDLED; } static const struct i2c_algorithm i2c_dw_algo = { -- GitLab From cdbd2f169bf1aff9f679a3e07d62244fc4341968 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:43 +0200 Subject: [PATCH 0608/1423] i2c: designware: Do not process interrupt when device is suspended Do not return with interrupt handled if host controller is off and thus interrupt is originating from other device or is spurious. Add a check to detect when controller is runtime suspended or transitioning/reset. In latter case all raw interrupt status register bits may read one. In both cases return IRQ_NONE to indicate interrupt was not from this device. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 1b7db2b58f311..d6fcf955dfc0b 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -778,6 +778,8 @@ static irqreturn_t i2c_dw_isr(int this_irq, void *dev_id) dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY)) return IRQ_NONE; + if (pm_runtime_suspended(dev->dev) || stat == GENMASK(31, 0)) + return IRQ_NONE; i2c_dw_irq_handler_master(dev); -- GitLab From 184c475ace922e4029f157080be559d49d70009f Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:44 +0200 Subject: [PATCH 0609/1423] i2c: designware: Move debug print in i2c_dw_isr() It is kind of needless to print interrupt status when code immediately after that finds interrupt was not originating from this device. Therefore move it after spurious interrupt detection. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index d6fcf955dfc0b..9c2c9d002dc34 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -775,11 +775,11 @@ static irqreturn_t i2c_dw_isr(int this_irq, void *dev_id) regmap_read(dev->map, DW_IC_ENABLE, &enabled); regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &stat); - dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY)) return IRQ_NONE; if (pm_runtime_suspended(dev->dev) || stat == GENMASK(31, 0)) return IRQ_NONE; + dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); i2c_dw_irq_handler_master(dev); -- GitLab From a92c3388b4ce34ee83f8ea398c1e00676a3dc467 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:45 +0200 Subject: [PATCH 0610/1423] i2c: designware: Simplify master interrupt handler nesting In my opinion a few lines of spurious interrupt detection code can be moved to the actual master interrupt handling function i2c_dw_isr() without hurting readability. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 33 ++++++++-------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 9c2c9d002dc34..dfb499e54c05d 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -711,9 +711,18 @@ static u32 i2c_dw_read_clear_intrbits(struct dw_i2c_dev *dev) * Interrupt service routine. This gets called whenever an I2C master interrupt * occurs. */ -static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) +static irqreturn_t i2c_dw_isr(int this_irq, void *dev_id) { - u32 stat; + struct dw_i2c_dev *dev = dev_id; + u32 stat, enabled; + + regmap_read(dev->map, DW_IC_ENABLE, &enabled); + regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &stat); + if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY)) + return IRQ_NONE; + if (pm_runtime_suspended(dev->dev) || stat == GENMASK(31, 0)) + return IRQ_NONE; + dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); stat = i2c_dw_read_clear_intrbits(dev); @@ -726,7 +735,7 @@ static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) * the HW active). */ regmap_write(dev->map, DW_IC_INTR_MASK, 0); - return 0; + return IRQ_HANDLED; } if (stat & DW_IC_INTR_TX_ABRT) { @@ -765,24 +774,6 @@ tx_aborted: regmap_write(dev->map, DW_IC_INTR_MASK, stat); } - return 0; -} - -static irqreturn_t i2c_dw_isr(int this_irq, void *dev_id) -{ - struct dw_i2c_dev *dev = dev_id; - u32 stat, enabled; - - regmap_read(dev->map, DW_IC_ENABLE, &enabled); - regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &stat); - if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY)) - return IRQ_NONE; - if (pm_runtime_suspended(dev->dev) || stat == GENMASK(31, 0)) - return IRQ_NONE; - dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); - - i2c_dw_irq_handler_master(dev); - return IRQ_HANDLED; } -- GitLab From fee61247b7f67a628bb24314b1a3a20d1f1c60f0 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:46 +0200 Subject: [PATCH 0611/1423] i2c: designware: Remove common i2c_dw_disable_int() Commit 90312351fd1e ("i2c: designware: MASTER mode as separated driver") introduced disable_int pointer but there is no real use for it. Both i2c-designware-master.c and i2c-designware-slave.c set it to the same i2c_dw_disable_int() and scope is inside the same kernel module. Since i2c_dw_disable_int() is just masking interrupts and the direct DW_IC_INTR_MASK register write looks more clear in the code use that and remove it from common code. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-common.c | 5 ----- drivers/i2c/busses/i2c-designware-core.h | 3 --- drivers/i2c/busses/i2c-designware-master.c | 9 ++++----- drivers/i2c/busses/i2c-designware-slave.c | 3 +-- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index c023b691441ea..a3240ece55b2b 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -625,10 +625,5 @@ void i2c_dw_disable(struct dw_i2c_dev *dev) i2c_dw_release_lock(dev); } -void i2c_dw_disable_int(struct dw_i2c_dev *dev) -{ - regmap_write(dev->map, DW_IC_INTR_MASK, 0); -} - MODULE_DESCRIPTION("Synopsys DesignWare I2C bus adapter core"); MODULE_LICENSE("GPL"); diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 457e6966f85e0..49e5860b16651 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -232,7 +232,6 @@ struct reset_control; * -1 if there is no semaphore. * @shared_with_punit: true if this bus is shared with the SoCs PUNIT * @disable: function to disable the controller - * @disable_int: function to disable all interrupts * @init: function to initialize the I2C hardware * @set_sda_hold_time: callback to retrieve IP specific SDA hold timing * @mode: operation mode - DW_IC_MASTER or DW_IC_SLAVE @@ -290,7 +289,6 @@ struct dw_i2c_dev { int semaphore_idx; bool shared_with_punit; void (*disable)(struct dw_i2c_dev *dev); - void (*disable_int)(struct dw_i2c_dev *dev); int (*init)(struct dw_i2c_dev *dev); int (*set_sda_hold_time)(struct dw_i2c_dev *dev); int mode; @@ -331,7 +329,6 @@ int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev); int i2c_dw_set_fifo_size(struct dw_i2c_dev *dev); u32 i2c_dw_func(struct i2c_adapter *adap); void i2c_dw_disable(struct dw_i2c_dev *dev); -void i2c_dw_disable_int(struct dw_i2c_dev *dev); static inline void __i2c_dw_enable(struct dw_i2c_dev *dev) { diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index dfb499e54c05d..45f569155bfe1 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -239,7 +239,7 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) msgs[dev->msg_write_idx].addr | ic_tar); /* Enforce disabled interrupts (due to HW issues) */ - i2c_dw_disable_int(dev); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); /* Enable the adapter */ __i2c_dw_enable(dev); @@ -299,7 +299,7 @@ static int amd_i2c_dw_xfer_quirk(struct i2c_adapter *adap, struct i2c_msg *msgs, dev->msgs = msgs; dev->msgs_num = num_msgs; i2c_dw_xfer_init(dev); - i2c_dw_disable_int(dev); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); /* Initiate messages read/write transaction */ for (msg_wrt_idx = 0; msg_wrt_idx < num_msgs; msg_wrt_idx++) { @@ -770,7 +770,7 @@ tx_aborted: else if (unlikely(dev->flags & ACCESS_INTR_MASK)) { /* Workaround to trigger pending interrupt */ regmap_read(dev->map, DW_IC_INTR_MASK, &stat); - i2c_dw_disable_int(dev); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); regmap_write(dev->map, DW_IC_INTR_MASK, stat); } @@ -871,7 +871,6 @@ int i2c_dw_probe_master(struct dw_i2c_dev *dev) dev->init = i2c_dw_init_master; dev->disable = i2c_dw_disable; - dev->disable_int = i2c_dw_disable_int; ret = i2c_dw_init_regmap(dev); if (ret) @@ -910,7 +909,7 @@ int i2c_dw_probe_master(struct dw_i2c_dev *dev) if (ret) return ret; - i2c_dw_disable_int(dev); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); i2c_dw_release_lock(dev); ret = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr, irq_flags, diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 3c855cd45c34c..c6d2e4c2ac23c 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -87,7 +87,7 @@ static int i2c_dw_unreg_slave(struct i2c_client *slave) { struct dw_i2c_dev *dev = i2c_get_adapdata(slave->adapter); - dev->disable_int(dev); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); dev->disable(dev); synchronize_irq(dev->irq); dev->slave = NULL; @@ -232,7 +232,6 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev) dev->init = i2c_dw_init_slave; dev->disable = i2c_dw_disable; - dev->disable_int = i2c_dw_disable_int; ret = i2c_dw_init_regmap(dev); if (ret) -- GitLab From 966b7d3c738ab8d82363eff9e4c62e429697893e Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:47 +0200 Subject: [PATCH 0612/1423] i2c: designware: Align defines in i2c-designware-core.h Align all defines to the same column. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-core.h | 230 +++++++++++------------ 1 file changed, 115 insertions(+), 115 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 49e5860b16651..0668888d557d5 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -18,12 +18,12 @@ #include #include -#define DW_IC_DEFAULT_FUNCTIONALITY (I2C_FUNC_I2C | \ - I2C_FUNC_SMBUS_BYTE | \ - I2C_FUNC_SMBUS_BYTE_DATA | \ - I2C_FUNC_SMBUS_WORD_DATA | \ - I2C_FUNC_SMBUS_BLOCK_DATA | \ - I2C_FUNC_SMBUS_I2C_BLOCK) +#define DW_IC_DEFAULT_FUNCTIONALITY (I2C_FUNC_I2C | \ + I2C_FUNC_SMBUS_BYTE | \ + I2C_FUNC_SMBUS_BYTE_DATA | \ + I2C_FUNC_SMBUS_WORD_DATA | \ + I2C_FUNC_SMBUS_BLOCK_DATA | \ + I2C_FUNC_SMBUS_I2C_BLOCK) #define DW_IC_CON_MASTER BIT(0) #define DW_IC_CON_SPEED_STD (1 << 1) @@ -43,81 +43,81 @@ /* * Registers offset */ -#define DW_IC_CON 0x00 -#define DW_IC_TAR 0x04 -#define DW_IC_SAR 0x08 -#define DW_IC_DATA_CMD 0x10 -#define DW_IC_SS_SCL_HCNT 0x14 -#define DW_IC_SS_SCL_LCNT 0x18 -#define DW_IC_FS_SCL_HCNT 0x1c -#define DW_IC_FS_SCL_LCNT 0x20 -#define DW_IC_HS_SCL_HCNT 0x24 -#define DW_IC_HS_SCL_LCNT 0x28 -#define DW_IC_INTR_STAT 0x2c -#define DW_IC_INTR_MASK 0x30 -#define DW_IC_RAW_INTR_STAT 0x34 -#define DW_IC_RX_TL 0x38 -#define DW_IC_TX_TL 0x3c -#define DW_IC_CLR_INTR 0x40 -#define DW_IC_CLR_RX_UNDER 0x44 -#define DW_IC_CLR_RX_OVER 0x48 -#define DW_IC_CLR_TX_OVER 0x4c -#define DW_IC_CLR_RD_REQ 0x50 -#define DW_IC_CLR_TX_ABRT 0x54 -#define DW_IC_CLR_RX_DONE 0x58 -#define DW_IC_CLR_ACTIVITY 0x5c -#define DW_IC_CLR_STOP_DET 0x60 -#define DW_IC_CLR_START_DET 0x64 -#define DW_IC_CLR_GEN_CALL 0x68 -#define DW_IC_ENABLE 0x6c -#define DW_IC_STATUS 0x70 -#define DW_IC_TXFLR 0x74 -#define DW_IC_RXFLR 0x78 -#define DW_IC_SDA_HOLD 0x7c -#define DW_IC_TX_ABRT_SOURCE 0x80 -#define DW_IC_ENABLE_STATUS 0x9c -#define DW_IC_CLR_RESTART_DET 0xa8 -#define DW_IC_COMP_PARAM_1 0xf4 -#define DW_IC_COMP_VERSION 0xf8 -#define DW_IC_SDA_HOLD_MIN_VERS 0x3131312A -#define DW_IC_COMP_TYPE 0xfc -#define DW_IC_COMP_TYPE_VALUE 0x44570140 - -#define DW_IC_INTR_RX_UNDER BIT(0) -#define DW_IC_INTR_RX_OVER BIT(1) -#define DW_IC_INTR_RX_FULL BIT(2) -#define DW_IC_INTR_TX_OVER BIT(3) -#define DW_IC_INTR_TX_EMPTY BIT(4) -#define DW_IC_INTR_RD_REQ BIT(5) -#define DW_IC_INTR_TX_ABRT BIT(6) -#define DW_IC_INTR_RX_DONE BIT(7) -#define DW_IC_INTR_ACTIVITY BIT(8) -#define DW_IC_INTR_STOP_DET BIT(9) -#define DW_IC_INTR_START_DET BIT(10) -#define DW_IC_INTR_GEN_CALL BIT(11) -#define DW_IC_INTR_RESTART_DET BIT(12) - -#define DW_IC_INTR_DEFAULT_MASK (DW_IC_INTR_RX_FULL | \ - DW_IC_INTR_TX_ABRT | \ - DW_IC_INTR_STOP_DET) -#define DW_IC_INTR_MASTER_MASK (DW_IC_INTR_DEFAULT_MASK | \ - DW_IC_INTR_TX_EMPTY) -#define DW_IC_INTR_SLAVE_MASK (DW_IC_INTR_DEFAULT_MASK | \ - DW_IC_INTR_RX_UNDER | \ - DW_IC_INTR_RD_REQ) - -#define DW_IC_STATUS_ACTIVITY BIT(0) -#define DW_IC_STATUS_TFE BIT(2) -#define DW_IC_STATUS_RFNE BIT(3) -#define DW_IC_STATUS_MASTER_ACTIVITY BIT(5) -#define DW_IC_STATUS_SLAVE_ACTIVITY BIT(6) - -#define DW_IC_SDA_HOLD_RX_SHIFT 16 -#define DW_IC_SDA_HOLD_RX_MASK GENMASK(23, 16) - -#define DW_IC_ERR_TX_ABRT 0x1 - -#define DW_IC_TAR_10BITADDR_MASTER BIT(12) +#define DW_IC_CON 0x00 +#define DW_IC_TAR 0x04 +#define DW_IC_SAR 0x08 +#define DW_IC_DATA_CMD 0x10 +#define DW_IC_SS_SCL_HCNT 0x14 +#define DW_IC_SS_SCL_LCNT 0x18 +#define DW_IC_FS_SCL_HCNT 0x1c +#define DW_IC_FS_SCL_LCNT 0x20 +#define DW_IC_HS_SCL_HCNT 0x24 +#define DW_IC_HS_SCL_LCNT 0x28 +#define DW_IC_INTR_STAT 0x2c +#define DW_IC_INTR_MASK 0x30 +#define DW_IC_RAW_INTR_STAT 0x34 +#define DW_IC_RX_TL 0x38 +#define DW_IC_TX_TL 0x3c +#define DW_IC_CLR_INTR 0x40 +#define DW_IC_CLR_RX_UNDER 0x44 +#define DW_IC_CLR_RX_OVER 0x48 +#define DW_IC_CLR_TX_OVER 0x4c +#define DW_IC_CLR_RD_REQ 0x50 +#define DW_IC_CLR_TX_ABRT 0x54 +#define DW_IC_CLR_RX_DONE 0x58 +#define DW_IC_CLR_ACTIVITY 0x5c +#define DW_IC_CLR_STOP_DET 0x60 +#define DW_IC_CLR_START_DET 0x64 +#define DW_IC_CLR_GEN_CALL 0x68 +#define DW_IC_ENABLE 0x6c +#define DW_IC_STATUS 0x70 +#define DW_IC_TXFLR 0x74 +#define DW_IC_RXFLR 0x78 +#define DW_IC_SDA_HOLD 0x7c +#define DW_IC_TX_ABRT_SOURCE 0x80 +#define DW_IC_ENABLE_STATUS 0x9c +#define DW_IC_CLR_RESTART_DET 0xa8 +#define DW_IC_COMP_PARAM_1 0xf4 +#define DW_IC_COMP_VERSION 0xf8 +#define DW_IC_SDA_HOLD_MIN_VERS 0x3131312A +#define DW_IC_COMP_TYPE 0xfc +#define DW_IC_COMP_TYPE_VALUE 0x44570140 + +#define DW_IC_INTR_RX_UNDER BIT(0) +#define DW_IC_INTR_RX_OVER BIT(1) +#define DW_IC_INTR_RX_FULL BIT(2) +#define DW_IC_INTR_TX_OVER BIT(3) +#define DW_IC_INTR_TX_EMPTY BIT(4) +#define DW_IC_INTR_RD_REQ BIT(5) +#define DW_IC_INTR_TX_ABRT BIT(6) +#define DW_IC_INTR_RX_DONE BIT(7) +#define DW_IC_INTR_ACTIVITY BIT(8) +#define DW_IC_INTR_STOP_DET BIT(9) +#define DW_IC_INTR_START_DET BIT(10) +#define DW_IC_INTR_GEN_CALL BIT(11) +#define DW_IC_INTR_RESTART_DET BIT(12) + +#define DW_IC_INTR_DEFAULT_MASK (DW_IC_INTR_RX_FULL | \ + DW_IC_INTR_TX_ABRT | \ + DW_IC_INTR_STOP_DET) +#define DW_IC_INTR_MASTER_MASK (DW_IC_INTR_DEFAULT_MASK | \ + DW_IC_INTR_TX_EMPTY) +#define DW_IC_INTR_SLAVE_MASK (DW_IC_INTR_DEFAULT_MASK | \ + DW_IC_INTR_RX_UNDER | \ + DW_IC_INTR_RD_REQ) + +#define DW_IC_STATUS_ACTIVITY BIT(0) +#define DW_IC_STATUS_TFE BIT(2) +#define DW_IC_STATUS_RFNE BIT(3) +#define DW_IC_STATUS_MASTER_ACTIVITY BIT(5) +#define DW_IC_STATUS_SLAVE_ACTIVITY BIT(6) + +#define DW_IC_SDA_HOLD_RX_SHIFT 16 +#define DW_IC_SDA_HOLD_RX_MASK GENMASK(23, 16) + +#define DW_IC_ERR_TX_ABRT 0x1 + +#define DW_IC_TAR_10BITADDR_MASTER BIT(12) #define DW_IC_COMP_PARAM_1_SPEED_MODE_HIGH (BIT(2) | BIT(3)) #define DW_IC_COMP_PARAM_1_SPEED_MODE_MASK GENMASK(3, 2) @@ -125,16 +125,16 @@ /* * Sofware status flags */ -#define STATUS_ACTIVE BIT(0) -#define STATUS_WRITE_IN_PROGRESS BIT(1) -#define STATUS_READ_IN_PROGRESS BIT(2) -#define STATUS_MASK GENMASK(2, 0) +#define STATUS_ACTIVE BIT(0) +#define STATUS_WRITE_IN_PROGRESS BIT(1) +#define STATUS_READ_IN_PROGRESS BIT(2) +#define STATUS_MASK GENMASK(2, 0) /* * operation modes */ -#define DW_IC_MASTER 0 -#define DW_IC_SLAVE 1 +#define DW_IC_MASTER 0 +#define DW_IC_SLAVE 1 /* * Hardware abort codes from the DW_IC_TX_ABRT_SOURCE register @@ -142,20 +142,20 @@ * Only expected abort codes are listed here * refer to the datasheet for the full list */ -#define ABRT_7B_ADDR_NOACK 0 -#define ABRT_10ADDR1_NOACK 1 -#define ABRT_10ADDR2_NOACK 2 -#define ABRT_TXDATA_NOACK 3 -#define ABRT_GCALL_NOACK 4 -#define ABRT_GCALL_READ 5 -#define ABRT_SBYTE_ACKDET 7 -#define ABRT_SBYTE_NORSTRT 9 -#define ABRT_10B_RD_NORSTRT 10 -#define ABRT_MASTER_DIS 11 -#define ARB_LOST 12 -#define ABRT_SLAVE_FLUSH_TXFIFO 13 -#define ABRT_SLAVE_ARBLOST 14 -#define ABRT_SLAVE_RD_INTX 15 +#define ABRT_7B_ADDR_NOACK 0 +#define ABRT_10ADDR1_NOACK 1 +#define ABRT_10ADDR2_NOACK 2 +#define ABRT_TXDATA_NOACK 3 +#define ABRT_GCALL_NOACK 4 +#define ABRT_GCALL_READ 5 +#define ABRT_SBYTE_ACKDET 7 +#define ABRT_SBYTE_NORSTRT 9 +#define ABRT_10B_RD_NORSTRT 10 +#define ABRT_MASTER_DIS 11 +#define ARB_LOST 12 +#define ABRT_SLAVE_FLUSH_TXFIFO 13 +#define ABRT_SLAVE_ARBLOST 14 +#define ABRT_SLAVE_RD_INTX 15 #define DW_IC_TX_ABRT_7B_ADDR_NOACK BIT(ABRT_7B_ADDR_NOACK) #define DW_IC_TX_ABRT_10ADDR1_NOACK BIT(ABRT_10ADDR1_NOACK) @@ -172,11 +172,11 @@ #define DW_IC_RX_ABRT_SLAVE_ARBLOST BIT(ABRT_SLAVE_ARBLOST) #define DW_IC_RX_ABRT_SLAVE_FLUSH_TXFIFO BIT(ABRT_SLAVE_FLUSH_TXFIFO) -#define DW_IC_TX_ABRT_NOACK (DW_IC_TX_ABRT_7B_ADDR_NOACK | \ - DW_IC_TX_ABRT_10ADDR1_NOACK | \ - DW_IC_TX_ABRT_10ADDR2_NOACK | \ - DW_IC_TX_ABRT_TXDATA_NOACK | \ - DW_IC_TX_ABRT_GCALL_NOACK) +#define DW_IC_TX_ABRT_NOACK (DW_IC_TX_ABRT_7B_ADDR_NOACK | \ + DW_IC_TX_ABRT_10ADDR1_NOACK | \ + DW_IC_TX_ABRT_10ADDR2_NOACK | \ + DW_IC_TX_ABRT_TXDATA_NOACK | \ + DW_IC_TX_ABRT_GCALL_NOACK) struct clk; struct device; @@ -295,21 +295,21 @@ struct dw_i2c_dev { struct i2c_bus_recovery_info rinfo; }; -#define ACCESS_INTR_MASK BIT(0) -#define ACCESS_NO_IRQ_SUSPEND BIT(1) -#define ARBITRATION_SEMAPHORE BIT(2) +#define ACCESS_INTR_MASK BIT(0) +#define ACCESS_NO_IRQ_SUSPEND BIT(1) +#define ARBITRATION_SEMAPHORE BIT(2) -#define MODEL_MSCC_OCELOT BIT(8) -#define MODEL_BAIKAL_BT1 BIT(9) -#define MODEL_AMD_NAVI_GPU BIT(10) -#define MODEL_MASK GENMASK(11, 8) +#define MODEL_MSCC_OCELOT BIT(8) +#define MODEL_BAIKAL_BT1 BIT(9) +#define MODEL_AMD_NAVI_GPU BIT(10) +#define MODEL_MASK GENMASK(11, 8) /* * Enable UCSI interrupt by writing 0xd at register * offset 0x474 specified in hardware specification. */ -#define AMD_UCSI_INTR_REG 0x474 -#define AMD_UCSI_INTR_EN 0xd +#define AMD_UCSI_INTR_REG 0x474 +#define AMD_UCSI_INTR_EN 0xd struct i2c_dw_semaphore_callbacks { int (*probe)(struct dw_i2c_dev *dev); -- GitLab From 4bae6da1cbf4658a92e6f58da30bf536746d1e5d Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 7 Nov 2022 15:42:48 +0200 Subject: [PATCH 0613/1423] i2c: designware: Add comment to custom register value constants DW_IC_COMP_VERSION register contains the ASCII representation of the Synopsys component version. Here 0x3131312A == "111*" means version 1.11* required for DW_IC_SDA_HOLD register availability where '*' means any letter starting from 'a'. DW_IC_COMP_TYPE is constant and is derived from two ASCII letters "DW" followed by a 16-bit unsigned number. Suggested-by: Andy Shevchenko Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 0668888d557d5..95ebc5eaa5d12 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -79,9 +79,9 @@ #define DW_IC_CLR_RESTART_DET 0xa8 #define DW_IC_COMP_PARAM_1 0xf4 #define DW_IC_COMP_VERSION 0xf8 -#define DW_IC_SDA_HOLD_MIN_VERS 0x3131312A +#define DW_IC_SDA_HOLD_MIN_VERS 0x3131312A /* "111*" == v1.11* */ #define DW_IC_COMP_TYPE 0xfc -#define DW_IC_COMP_TYPE_VALUE 0x44570140 +#define DW_IC_COMP_TYPE_VALUE 0x44570140 /* "DW" + 0x0140 */ #define DW_IC_INTR_RX_UNDER BIT(0) #define DW_IC_INTR_RX_OVER BIT(1) -- GitLab From c57351a75d013c30e4a726aef1ad441676a99da4 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Sat, 12 Nov 2022 17:43:22 +0800 Subject: [PATCH 0614/1423] KVM: Push dirty information unconditionally to backup bitmap In mark_page_dirty_in_slot(), we bail out when no running vcpu exists and a running vcpu context is strictly required by architecture. It may cause backwards compatible issue. Currently, saving vgic/its tables is the only known case where no running vcpu context is expected. We may have other unknown cases where no running vcpu context exists and it's reported by the warning message and we bail out without pushing the dirty information to the backup bitmap. For this, the application is going to enable the backup bitmap for the unknown cases. However, the dirty information can't be pushed to the backup bitmap even though the backup bitmap is enabled for those unknown cases in the application, until the unknown cases are added to the allowed list of non-running vcpu context with extra code changes to the host kernel. In order to make the new application, where the backup bitmap has been enabled, to work with the unchanged host, we continue to push the dirty information to the backup bitmap instead of bailing out early. With the added check on 'memslot->dirty_bitmap' to mark_page_dirty_in_slot(), the kernel crash is avoided silently by the combined conditions: no running vcpu context, kvm_arch_allow_write_without_running_vcpu() returns 'true', and the backup bitmap (KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP) isn't enabled yet. Suggested-by: Sean Christopherson Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221112094322.21911-1-gshan@redhat.com --- virt/kvm/kvm_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index be40d1ce6e913..0fa541ba8ab54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3308,8 +3308,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) return; - if (WARN_ON_ONCE(!kvm_arch_allow_write_without_running_vcpu(kvm) && !vcpu)) - return; + WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); #endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { @@ -3318,7 +3317,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); - else + else if (memslot->dirty_bitmap) set_bit_le(rel_gfn, memslot->dirty_bitmap); } } -- GitLab From 8502bee5584235943c4d371597c740d6779991db Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 10 Nov 2022 17:23:42 +0800 Subject: [PATCH 0615/1423] i2c: imx: use devm_platform_get_and_ioremap_resource() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert platform_get_resource(), devm_ioremap_resource() to a single call to devm_platform_get_and_ioremap_resource(), as this is exactly what this function does. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Reviewed-by: Mukesh Ojha Reviewed-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-imx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 3082183bd66a4..1ce0cf7a323fa 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1449,8 +1449,7 @@ static int i2c_imx_probe(struct platform_device *pdev) if (irq < 0) return irq; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - base = devm_ioremap_resource(&pdev->dev, res); + base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(base)) return PTR_ERR(base); -- GitLab From e826192cc26bd69746bbf22e6bdf72b87cb3d97b Mon Sep 17 00:00:00 2001 From: Arminder Singh Date: Sat, 5 Nov 2022 07:56:49 -0400 Subject: [PATCH 0616/1423] i2c: /pasemi: PASemi I2C controller IRQ enablement This patch adds IRQ support to the PASemi I2C controller driver to increase the performace of I2C transactions on platforms with PASemi I2C controllers. While primarily intended for Apple silicon platforms, this patch should also help in enabling IRQ support for older PASemi hardware as well should the need arise. This version of the patch has been tested on an M1 Ultra Mac Studio, as well as an M1 MacBook Pro, and userspace launches successfully while using the IRQ path for I2C transactions. Signed-off-by: Arminder Singh Reviewed-by: Sven Peter Reviewed-by: Hector Martin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pasemi-core.c | 32 ++++++++++++++++++++---- drivers/i2c/busses/i2c-pasemi-core.h | 5 ++++ drivers/i2c/busses/i2c-pasemi-platform.c | 6 +++++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-pasemi-core.c b/drivers/i2c/busses/i2c-pasemi-core.c index 9028ffb58cc07..7d54a9f34c74b 100644 --- a/drivers/i2c/busses/i2c-pasemi-core.c +++ b/drivers/i2c/busses/i2c-pasemi-core.c @@ -21,6 +21,7 @@ #define REG_MTXFIFO 0x00 #define REG_MRXFIFO 0x04 #define REG_SMSTA 0x14 +#define REG_IMASK 0x18 #define REG_CTL 0x1c #define REG_REV 0x28 @@ -66,6 +67,7 @@ static void pasemi_reset(struct pasemi_smbus *smbus) val |= CTL_EN; reg_write(smbus, REG_CTL, val); + reinit_completion(&smbus->irq_completion); } static void pasemi_smb_clear(struct pasemi_smbus *smbus) @@ -78,14 +80,21 @@ static void pasemi_smb_clear(struct pasemi_smbus *smbus) static int pasemi_smb_waitready(struct pasemi_smbus *smbus) { - int timeout = 10; + int timeout = 100; unsigned int status; - status = reg_read(smbus, REG_SMSTA); - - while (!(status & SMSTA_XEN) && timeout--) { - msleep(1); + if (smbus->use_irq) { + reinit_completion(&smbus->irq_completion); + reg_write(smbus, REG_IMASK, SMSTA_XEN | SMSTA_MTN); + wait_for_completion_timeout(&smbus->irq_completion, msecs_to_jiffies(100)); + reg_write(smbus, REG_IMASK, 0); status = reg_read(smbus, REG_SMSTA); + } else { + status = reg_read(smbus, REG_SMSTA); + while (!(status & SMSTA_XEN) && timeout--) { + msleep(1); + status = reg_read(smbus, REG_SMSTA); + } } /* Got NACK? */ @@ -344,10 +353,14 @@ int pasemi_i2c_common_probe(struct pasemi_smbus *smbus) /* set up the sysfs linkage to our parent device */ smbus->adapter.dev.parent = smbus->dev; + smbus->use_irq = 0; + init_completion(&smbus->irq_completion); if (smbus->hw_rev != PASEMI_HW_REV_PCI) smbus->hw_rev = reg_read(smbus, REG_REV); + reg_write(smbus, REG_IMASK, 0); + pasemi_reset(smbus); error = devm_i2c_add_adapter(smbus->dev, &smbus->adapter); @@ -356,3 +369,12 @@ int pasemi_i2c_common_probe(struct pasemi_smbus *smbus) return 0; } + +irqreturn_t pasemi_irq_handler(int irq, void *dev_id) +{ + struct pasemi_smbus *smbus = dev_id; + + reg_write(smbus, REG_IMASK, 0); + complete(&smbus->irq_completion); + return IRQ_HANDLED; +} diff --git a/drivers/i2c/busses/i2c-pasemi-core.h b/drivers/i2c/busses/i2c-pasemi-core.h index 4655124a37f3a..88821f4e8a9f8 100644 --- a/drivers/i2c/busses/i2c-pasemi-core.h +++ b/drivers/i2c/busses/i2c-pasemi-core.h @@ -7,6 +7,7 @@ #include #include #include +#include #define PASEMI_HW_REV_PCI -1 @@ -16,6 +17,10 @@ struct pasemi_smbus { void __iomem *ioaddr; unsigned int clk_div; int hw_rev; + int use_irq; + struct completion irq_completion; }; int pasemi_i2c_common_probe(struct pasemi_smbus *smbus); + +irqreturn_t pasemi_irq_handler(int irq, void *dev_id); diff --git a/drivers/i2c/busses/i2c-pasemi-platform.c b/drivers/i2c/busses/i2c-pasemi-platform.c index 88a54aaf7e3c3..e35945a91dbef 100644 --- a/drivers/i2c/busses/i2c-pasemi-platform.c +++ b/drivers/i2c/busses/i2c-pasemi-platform.c @@ -49,6 +49,7 @@ static int pasemi_platform_i2c_probe(struct platform_device *pdev) struct pasemi_smbus *smbus; u32 frequency; int error; + int irq_num; data = devm_kzalloc(dev, sizeof(struct pasemi_platform_i2c_data), GFP_KERNEL); @@ -82,6 +83,11 @@ static int pasemi_platform_i2c_probe(struct platform_device *pdev) if (error) goto out_clk_disable; + irq_num = platform_get_irq(pdev, 0); + error = devm_request_irq(smbus->dev, irq_num, pasemi_irq_handler, 0, "pasemi_apple_i2c", (void *)smbus); + + if (!error) + smbus->use_irq = 1; platform_set_drvdata(pdev, data); return 0; -- GitLab From 3574cfdca28543e2e8db649297cd6659ea8e4bb8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 11 Nov 2022 11:55:29 +0200 Subject: [PATCH 0617/1423] RDMA/mana: Remove redefinition of basic u64 type gdma_obj_handle_t is no more than redefinition of basic u64 type. Remove such obfuscation. Link: https://lore.kernel.org/r/3c1e821279e6a165d058655d2343722d6650e776.1668160486.git.leonro@nvidia.com Acked-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 5 ++- .../net/ethernet/microsoft/mana/gdma_main.c | 3 +- include/net/mana/gdma.h | 31 +++++++++---------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index a56236cdd9eec..351207c60eb65 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -73,8 +73,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, return 0; } -static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, - gdma_obj_handle_t mr_handle) +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) { struct gdma_destroy_mr_response resp = {}; struct gdma_destroy_mr_request req = {}; @@ -108,9 +107,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); struct gdma_create_mr_params mr_params = {}; struct ib_device *ibdev = ibpd->device; - gdma_obj_handle_t dma_region_handle; struct mana_ib_dev *dev; struct mana_ib_mr *mr; + u64 dma_region_handle; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 46a7d1e6ece98..69224ff8efb60 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -671,8 +671,7 @@ free_q: return err; } -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle) +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) { struct gdma_destroy_dma_region_req req = {}; struct gdma_general_resp resp = {}; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 221adc96340cb..a9fdae14d24c0 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -65,8 +65,6 @@ enum { GDMA_DEVICE_MANA = 2, }; -typedef u64 gdma_obj_handle_t; - struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -200,7 +198,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -624,7 +622,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - gdma_obj_handle_t gdma_region; + u64 gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -699,14 +697,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -718,7 +716,7 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ enum gdma_pd_flags { @@ -733,14 +731,14 @@ struct gdma_create_pd_req { struct gdma_create_pd_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; u32 pd_id; u32 reserved; };/* HW DATA */ struct gdma_destroy_pd_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; };/* HW DATA */ struct gdma_destory_pd_resp { @@ -756,11 +754,11 @@ enum gdma_mr_type { }; struct gdma_create_mr_params { - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -769,13 +767,13 @@ struct gdma_create_mr_params { struct gdma_create_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; u32 reserved_1; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -786,14 +784,14 @@ struct gdma_create_mr_request { struct gdma_create_mr_response { struct gdma_resp_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; u32 lkey; u32 rkey; };/* HW DATA */ struct gdma_destroy_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; };/* HW DATA */ struct gdma_destroy_mr_response { @@ -827,7 +825,6 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle); +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); #endif /* _GDMA_H */ -- GitLab From 1d26a55fbeb9c24bb24fa84595c56efee8783f35 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 6 Sep 2022 13:43:00 -0700 Subject: [PATCH 0618/1423] PCI: histb: Switch to using gpiod API This patch switches the driver away from legacy gpio/of_gpio API to gpiod API, and removes use of of_get_named_gpio_flags() which I want to make private to gpiolib. Link: https://lore.kernel.org/r/20220906204301.3736813-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov Signed-off-by: Lorenzo Pieralisi Reviewed-by: Linus Walleij --- drivers/pci/controller/dwc/pcie-histb.c | 39 ++++++++++++------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-histb.c b/drivers/pci/controller/dwc/pcie-histb.c index e2b80f10030d0..43c27812dd6d4 100644 --- a/drivers/pci/controller/dwc/pcie-histb.c +++ b/drivers/pci/controller/dwc/pcie-histb.c @@ -10,11 +10,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -60,7 +60,7 @@ struct histb_pcie { struct reset_control *sys_reset; struct reset_control *bus_reset; void __iomem *ctrl; - int reset_gpio; + struct gpio_desc *reset_gpio; struct regulator *vpcie; }; @@ -212,8 +212,8 @@ static void histb_pcie_host_disable(struct histb_pcie *hipcie) clk_disable_unprepare(hipcie->sys_clk); clk_disable_unprepare(hipcie->bus_clk); - if (gpio_is_valid(hipcie->reset_gpio)) - gpio_set_value_cansleep(hipcie->reset_gpio, 0); + if (hipcie->reset_gpio) + gpiod_set_value_cansleep(hipcie->reset_gpio, 1); if (hipcie->vpcie) regulator_disable(hipcie->vpcie); @@ -235,8 +235,8 @@ static int histb_pcie_host_enable(struct dw_pcie_rp *pp) } } - if (gpio_is_valid(hipcie->reset_gpio)) - gpio_set_value_cansleep(hipcie->reset_gpio, 1); + if (hipcie->reset_gpio) + gpiod_set_value_cansleep(hipcie->reset_gpio, 0); ret = clk_prepare_enable(hipcie->bus_clk); if (ret) { @@ -298,10 +298,7 @@ static int histb_pcie_probe(struct platform_device *pdev) struct histb_pcie *hipcie; struct dw_pcie *pci; struct dw_pcie_rp *pp; - struct device_node *np = pdev->dev.of_node; struct device *dev = &pdev->dev; - enum of_gpio_flags of_flags; - unsigned long flag = GPIOF_DIR_OUT; int ret; hipcie = devm_kzalloc(dev, sizeof(*hipcie), GFP_KERNEL); @@ -336,17 +333,19 @@ static int histb_pcie_probe(struct platform_device *pdev) hipcie->vpcie = NULL; } - hipcie->reset_gpio = of_get_named_gpio_flags(np, - "reset-gpios", 0, &of_flags); - if (of_flags & OF_GPIO_ACTIVE_LOW) - flag |= GPIOF_ACTIVE_LOW; - if (gpio_is_valid(hipcie->reset_gpio)) { - ret = devm_gpio_request_one(dev, hipcie->reset_gpio, - flag, "PCIe device power control"); - if (ret) { - dev_err(dev, "unable to request gpio\n"); - return ret; - } + hipcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", + GPIOD_OUT_HIGH); + ret = PTR_ERR_OR_ZERO(hipcie->reset_gpio); + if (ret) { + dev_err(dev, "unable to request reset gpio: %d\n", ret); + return ret; + } + + ret = gpiod_set_consumer_name(hipcie->reset_gpio, + "PCIe device power control"); + if (ret) { + dev_err(dev, "unable to set reset gpio name: %d\n", ret); + return ret; } hipcie->aux_clk = devm_clk_get(dev, "aux"); -- GitLab From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: [PATCH 0619/1423] vfio: Add an option to get migration data size Add an option to get migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET the estimated data length that will be required to complete STOP_COPY is returned. This option may better enable user space to consider before moving to STOP_COPY whether it can meet the downtime SLA based on the returned data. The patch also includes the implementation for mlx5 and hisi for this new option to make it feature complete for the existing drivers in this area. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 9 ++++++ drivers/vfio/pci/mlx5/main.c | 18 +++++++++++ drivers/vfio/pci/vfio_pci_core.c | 3 +- drivers/vfio/vfio_main.c | 32 +++++++++++++++++++ include/linux/vfio.h | 5 +++ include/uapi/linux/vfio.h | 13 ++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7c..0c0c0c7f05215 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -957,6 +957,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, return res; } +static int +hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = sizeof(struct acc_vf_data); + return 0; +} + static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) @@ -1213,6 +1221,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_set_state = hisi_acc_vfio_pci_set_device_state, .migration_get_state = hisi_acc_vfio_pci_get_device_state, + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, }; static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 457138b92f134..6e9cf2aacc527 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -512,6 +512,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +594,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9be2d5be5d959..189d4930c276d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2127,7 +2127,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (vdev->vdev.mig_ops) { if (!(vdev->vdev.mig_ops->migration_get_state && - vdev->vdev.mig_ops->migration_set_state) || + vdev->vdev.mig_ops->migration_set_state && + vdev->vdev.mig_ops->migration_get_data_size) || !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) return -EINVAL; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9835757e2bee4..662e267a3e13d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1242,6 +1242,34 @@ out_copy: return 0; } +static int +vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + struct vfio_device_feature_mig_data_size data_size = {}; + unsigned long stop_copy_length; + int ret; + + if (!device->mig_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(data_size)); + if (ret != 1) + return ret; + + ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); + if (ret) + return ret; + + data_size.stop_copy_length = stop_copy_length; + if (copy_to_user(arg, &data_size, sizeof(data_size))) + return -EFAULT; + + return 0; +} + static int vfio_ioctl_device_feature_migration(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) @@ -1469,6 +1497,10 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_logging_report( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: + return vfio_ioctl_device_feature_migration_data_size( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7480154825ec..43b67e46a2cb5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -107,6 +107,9 @@ struct vfio_device_ops { * @migration_get_state: Optional callback to get the migration state for * devices that support migration. It's mandatory for * VFIO_DEVICE_FEATURE_MIGRATION migration support. + * @migration_get_data_size: Optional callback to get the estimated data + * length that will be required to complete stop copy. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. */ struct vfio_migration_ops { struct file *(*migration_set_state)( @@ -114,6 +117,8 @@ struct vfio_migration_ops { enum vfio_device_mig_state new_state); int (*migration_get_state)(struct vfio_device *device, enum vfio_device_mig_state *curr_state); + int (*migration_get_data_size)(struct vfio_device *device, + unsigned long *stop_copy_length); }; /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d7d8e0922376c..3e45dbaf190ef 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1128,6 +1128,19 @@ struct vfio_device_feature_dma_logging_report { #define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 +/* + * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will + * be required to complete stop copy. + * + * Note: Can be called on each device state. + */ + +struct vfio_device_feature_mig_data_size { + __aligned_u64 stop_copy_length; +}; + +#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- GitLab From 2f5d8cef45c30edcf3972d345f606df563d3a48e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:19 +0200 Subject: [PATCH 0620/1423] vfio/mlx5: Fix a typo in mlx5vf_cmd_load_vhca_state() Fix a typo in mlx5vf_cmd_load_vhca_state() to use the 'load' memory layout. As in/out sizes are equal for save and load commands there wasn't any functional issue. Fixes: f1d98f346ee3 ("vfio/mlx5: Expose migration commands over mlx5 device") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221106174630.25909-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5d..0848bc905d3ea 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -378,8 +378,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; u32 pdn, mkey; int err; -- GitLab From dac153f2802db1ad46207283cb9b2aae3d707a45 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:34 +0200 Subject: [PATCH 0621/1423] RDMA/restrack: Release MR restrack when delete The MR restrack also needs to be released when delete it, otherwise it cause memory leak as the task struct won't be released. Fixes: 13ef5539def7 ("RDMA/restrack: Count references to the verbs objects") Signed-off-by: Mark Zhang Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/703db18e8d4ef628691fb93980a709be673e62e3.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/restrack.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 1f935d9f61785..01a499a8b88db 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -343,8 +343,6 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) rt = &dev->res[res->type]; old = xa_erase(&rt->xa, res->id); - if (res->type == RDMA_RESTRACK_MR) - return; WARN_ON(old != res); out: -- GitLab From 5e15ff29b156bbbdeadae230c8ecd5ecd8ca2477 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:35 +0200 Subject: [PATCH 0622/1423] RDMA/core: Make sure "ib_port" is valid when access sysfs node The "ib_port" structure must be set before adding the sysfs kobject, and reset after removing it, otherwise it may crash when accessing the sysfs node: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000050 Mem abort info: ESR = 0x96000006 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000e85f5ba5 [0000000000000050] pgd=0000000848fd9003, pud=000000085b387003, pmd=0000000000000000 Internal error: Oops: 96000006 [#2] PREEMPT SMP Modules linked in: ib_umad(O) mlx5_ib(O) nfnetlink_cttimeout(E) nfnetlink(E) act_gact(E) cls_flower(E) sch_ingress(E) openvswitch(E) nsh(E) nf_nat_ipv6(E) nf_nat_ipv4(E) nf_conncount(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) mst_pciconf(O) ipmi_devintf(E) ipmi_msghandler(E) ipmb_dev_int(OE) mlx5_core(O) mlxfw(O) mlxdevm(O) auxiliary(O) ib_uverbs(O) ib_core(O) mlx_compat(O) psample(E) sbsa_gwdt(E) uio_pdrv_genirq(E) uio(E) mlxbf_pmc(OE) mlxbf_gige(OE) mlxbf_tmfifo(OE) gpio_mlxbf2(OE) pwr_mlxbf(OE) mlx_trio(OE) i2c_mlxbf(OE) mlx_bootctl(OE) bluefield_edac(OE) knem(O) ip_tables(E) ipv6(E) crc_ccitt(E) [last unloaded: mst_pci] Process grep (pid: 3372, stack limit = 0x0000000022055c92) CPU: 5 PID: 3372 Comm: grep Tainted: G D OE 4.19.161-mlnx.47.gadcd9e3 #1 Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS BlueField:3.9.2-15-ga2403ab Sep 8 2022 pstate: 40000005 (nZcv daif -PAN -UAO) pc : hw_stat_port_show+0x4c/0x80 [ib_core] lr : port_attr_show+0x40/0x58 [ib_core] sp : ffff000029f43b50 x29: ffff000029f43b50 x28: 0000000019375000 x27: ffff8007b821a540 x26: ffff000029f43e30 x25: 0000000000008000 x24: ffff000000eaa958 x23: 0000000000001000 x22: ffff8007a4ce3000 x21: ffff8007baff8000 x20: ffff8007b9066ac0 x19: ffff8007bae97578 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffff8007a4ce4000 x7 : 0000000000000000 x6 : 000000000000003f x5 : ffff000000e6a280 x4 : ffff8007a4ce3000 x3 : 0000000000000000 x2 : aaaaaaaaaaaaaaab x1 : ffff8007b9066a10 x0 : ffff8007baff8000 Call trace: hw_stat_port_show+0x4c/0x80 [ib_core] port_attr_show+0x40/0x58 [ib_core] sysfs_kf_seq_show+0x8c/0x150 kernfs_seq_show+0x44/0x50 seq_read+0x1b4/0x45c kernfs_fop_read+0x148/0x1d8 __vfs_read+0x58/0x180 vfs_read+0x94/0x154 ksys_read+0x68/0xd8 __arm64_sys_read+0x28/0x34 el0_svc_common+0x88/0x18c el0_svc_handler+0x78/0x94 el0_svc+0x8/0xe8 Code: f2955562 aa1603e4 aa1503e0 f9405683 (f9402861) Fixes: d8a5883814b9 ("RDMA/core: Replace the ib_port_data hw_stats pointers with a ib_port pointer") Signed-off-by: Mark Zhang Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/88867e705c42c1cd2011e45201c25eecdb9fef94.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/sysfs.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 84c53bd2a52db..ee59d73915689 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1213,6 +1213,9 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, p->port_num = port_num; kobject_init(&p->kobj, &port_type); + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = p; + cur_group = p->groups_list; ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, attr->gid_tbl_len, show_port_gid); @@ -1258,9 +1261,6 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, } list_add_tail(&p->kobj.entry, &coredev->port_list); - if (device->port_data && is_full_dev) - device->port_data[port_num].sysfs = p; - return p; err_groups: @@ -1268,6 +1268,8 @@ err_groups: err_del: kobject_del(&p->kobj); err_put: + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = NULL; kobject_put(&p->kobj); return ERR_PTR(ret); } @@ -1276,14 +1278,17 @@ static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) { bool is_full_dev = &port->ibdev->coredev == coredev; - if (port->ibdev->port_data && - port->ibdev->port_data[port->port_num].sysfs == port) - port->ibdev->port_data[port->port_num].sysfs = NULL; list_del(&port->kobj.entry); if (is_full_dev) sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); + sysfs_remove_groups(&port->kobj, port->groups_list); kobject_del(&port->kobj); + + if (port->ibdev->port_data && + port->ibdev->port_data[port->port_num].sysfs == port) + port->ibdev->port_data[port->port_num].sysfs = NULL; + kobject_put(&port->kobj); } -- GitLab From ecacb3751f254572af0009b9501e2cdc83a30b6a Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:36 +0200 Subject: [PATCH 0623/1423] RDMA/nldev: Return "-EAGAIN" if the cm_id isn't from expected port When filling a cm_id entry, return "-EAGAIN" instead of 0 if the cm_id doesn'the have the same port as requested, otherwise an incomplete entry may be returned, which causes "rdam res show cm_id" to return an error. For example on a machine with two rdma devices with "rping -C 1 -v -s" running background, the "rdma" command fails: $ rdma -V rdma utility, iproute2-5.19.0 $ rdma res show cm_id link mlx5_0/- cm-idn 0 state LISTEN ps TCP pid 28056 comm rping src-addr 0.0.0.0:7174 error: Protocol not available While with this fix it succeeds: $ rdma res show cm_id link mlx5_0/- cm-idn 0 state LISTEN ps TCP pid 26395 comm rping src-addr 0.0.0.0:7174 link mlx5_1/- cm-idn 0 state LISTEN ps TCP pid 26395 comm rping src-addr 0.0.0.0:7174 Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/a08e898cdac5e28428eb749a99d9d981571b8ea7.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index b92358f606d00..2be76a3fdd87a 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -552,7 +552,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_cm_id *cm_id = &id_priv->id; if (port && port != cm_id->port_num) - return 0; + return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) -- GitLab From 07445ae1c26367928311e13f2a821ae94410da7e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 15 Nov 2022 11:16:24 +0100 Subject: [PATCH 0624/1423] gpiolib: of: change of_find_gpio() to accept device node In preparation of switching all OF-based GPIO lookups to go through of_find_gpio() let's change it to accept device node as its argument as we do not always have access to device structure. Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 7 +++---- drivers/gpio/gpiolib-of.h | 4 ++-- drivers/gpio/gpiolib.c | 5 +++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 4be3c21aa7188..596b8e21700ef 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -605,7 +605,7 @@ static const of_find_gpio_quirk of_find_gpio_quirks[] = { NULL }; -struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, +struct gpio_desc *of_find_gpio(struct device_node *np, const char *con_id, unsigned int idx, unsigned long *flags) { char prop_name[32]; /* 32 is max size of property name */ @@ -623,8 +623,7 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, snprintf(prop_name, sizeof(prop_name), "%s", gpio_suffixes[i]); - desc = of_get_named_gpiod_flags(dev->of_node, prop_name, idx, - &of_flags); + desc = of_get_named_gpiod_flags(np, prop_name, idx, &of_flags); if (!gpiod_not_found(desc)) break; @@ -632,7 +631,7 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, /* Properly named GPIO was not found, try workarounds */ for (q = of_find_gpio_quirks; gpiod_not_found(desc) && *q; q++) - desc = (*q)(dev->of_node, con_id, idx, &of_flags); + desc = (*q)(np, con_id, idx, &of_flags); if (IS_ERR(desc)) return desc; diff --git a/drivers/gpio/gpiolib-of.h b/drivers/gpio/gpiolib-of.h index 2c32a332ede59..bd4131ff61d36 100644 --- a/drivers/gpio/gpiolib-of.h +++ b/drivers/gpio/gpiolib-of.h @@ -7,7 +7,7 @@ struct gpio_chip; enum of_gpio_flags; #ifdef CONFIG_OF_GPIO -struct gpio_desc *of_find_gpio(struct device *dev, +struct gpio_desc *of_find_gpio(struct device_node *np, const char *con_id, unsigned int idx, unsigned long *lookupflags); @@ -16,7 +16,7 @@ void of_gpiochip_remove(struct gpio_chip *gc); int of_gpio_get_count(struct device *dev, const char *con_id); void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev); #else -static inline struct gpio_desc *of_find_gpio(struct device *dev, +static inline struct gpio_desc *of_find_gpio(struct device_node *np, const char *con_id, unsigned int idx, unsigned long *lookupflags) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 11fb7ec883e9f..a80fc8abb03f3 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -4122,14 +4122,15 @@ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, int ret; /* Maybe we have a device name, maybe not */ const char *devname = dev ? dev_name(dev) : "?"; - const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; + struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; dev_dbg(dev, "GPIO lookup for consumer %s\n", con_id); /* Using device tree? */ if (is_of_node(fwnode)) { dev_dbg(dev, "using device tree for GPIO lookup\n"); - desc = of_find_gpio(dev, con_id, idx, &lookupflags); + desc = of_find_gpio(to_of_node(fwnode), + con_id, idx, &lookupflags); } else if (is_acpi_node(fwnode)) { dev_dbg(dev, "using ACPI for GPIO lookup\n"); desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags); -- GitLab From 2b6bce80ae70b91134a5731d85076042ae90c300 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 11 Nov 2022 14:19:04 -0800 Subject: [PATCH 0625/1423] gpiolib: acpi: change acpi_find_gpio() to accept firmware node In preparation of switching all ACPI-based GPIO lookups to go through acpi_find_gpio() let's change it to accept device node as its argument as we do not always have access to device structure. Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 8 ++++++-- drivers/gpio/gpiolib-acpi.h | 4 ++-- drivers/gpio/gpiolib.c | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index a7d2358736fe7..61b311e4560ff 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -906,18 +906,22 @@ static bool acpi_can_fallback_to_crs(struct acpi_device *adev, return con_id == NULL; } -struct gpio_desc *acpi_find_gpio(struct device *dev, +struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags *dflags, unsigned long *lookupflags) { - struct acpi_device *adev = ACPI_COMPANION(dev); + struct acpi_device *adev; struct acpi_gpio_info info; struct gpio_desc *desc; char propname[32]; int i; + adev = to_acpi_device_node(fwnode); + if (!adev) + return ERR_PTR(-ENODEV); + /* Try first from _DSD */ for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) { if (con_id) { diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index 1ac6816839dbc..9fc34830639ce 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -48,7 +48,7 @@ int acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, int acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, struct acpi_gpio_info *info); -struct gpio_desc *acpi_find_gpio(struct device *dev, +struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags *dflags, @@ -83,7 +83,7 @@ acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, } static inline struct gpio_desc * -acpi_find_gpio(struct device *dev, const char *con_id, +acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags *dflags, unsigned long *lookupflags) { diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a80fc8abb03f3..e874bb0ef685e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -4133,7 +4133,8 @@ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, con_id, idx, &lookupflags); } else if (is_acpi_node(fwnode)) { dev_dbg(dev, "using ACPI for GPIO lookup\n"); - desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags); + desc = acpi_find_gpio(fwnode, + con_id, idx, &flags, &lookupflags); } /* -- GitLab From 16ba046e86e93f42117efe7ca7a7940b83c60afc Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 11 Nov 2022 14:19:05 -0800 Subject: [PATCH 0626/1423] gpiolib: acpi: teach acpi_find_gpio() to handle data-only nodes In preparation of switching all ACPI-based GPIO lookups to go through acpi_find_gpio() we need to make sure it can handle data-only ACPI nodes, same as existing acpi_node_get_gpiod(). Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 76 ++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 61b311e4560ff..bd36fac20ea0d 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -864,8 +864,9 @@ static int acpi_gpio_property_lookup(struct fwnode_handle *fwnode, * function only returns the first. */ static struct gpio_desc *acpi_get_gpiod_by_index(struct acpi_device *adev, - const char *propname, int index, - struct acpi_gpio_info *info) + const char *propname, + int index, + struct acpi_gpio_info *info) { struct acpi_gpio_lookup lookup; int ret; @@ -896,6 +897,44 @@ static struct gpio_desc *acpi_get_gpiod_by_index(struct acpi_device *adev, return ret ? ERR_PTR(ret) : lookup.desc; } +/** + * acpi_get_gpiod_from_data() - get a GPIO descriptor from ACPI data node + * @fwnode: pointer to an ACPI firmware node to get the GPIO information from + * @propname: Property name of the GPIO + * @index: index of GpioIo/GpioInt resource (starting from %0) + * @info: info pointer to fill in (optional) + * + * This function uses the property-based GPIO lookup to get to the GPIO + * resource with the relevant information from a data-only ACPI firmware node + * and uses that to obtain the GPIO descriptor to return. + * + * If the GPIO cannot be translated or there is an error an ERR_PTR is + * returned. + */ +static struct gpio_desc *acpi_get_gpiod_from_data(struct fwnode_handle *fwnode, + const char *propname, + int index, + struct acpi_gpio_info *info) +{ + struct acpi_gpio_lookup lookup; + int ret; + + if (!is_acpi_data_node(fwnode)) + return ERR_PTR(-ENODEV); + + if (!propname) + return ERR_PTR(-EINVAL); + + lookup.index = index; + + ret = acpi_gpio_property_lookup(fwnode, propname, index, &lookup); + if (ret) + return ERR_PTR(ret); + + ret = acpi_gpio_resource_lookup(&lookup, info); + return ret ? ERR_PTR(ret) : lookup.desc; +} + static bool acpi_can_fallback_to_crs(struct acpi_device *adev, const char *con_id) { @@ -912,16 +951,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, enum gpiod_flags *dflags, unsigned long *lookupflags) { - struct acpi_device *adev; + struct acpi_device *adev = to_acpi_device_node(fwnode); struct acpi_gpio_info info; struct gpio_desc *desc; char propname[32]; int i; - adev = to_acpi_device_node(fwnode); - if (!adev) - return ERR_PTR(-ENODEV); - /* Try first from _DSD */ for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) { if (con_id) { @@ -932,7 +967,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, gpio_suffixes[i]); } - desc = acpi_get_gpiod_by_index(adev, propname, idx, &info); + if (adev) + desc = acpi_get_gpiod_by_index(adev, + propname, idx, &info); + else + desc = acpi_get_gpiod_from_data(fwnode, + propname, idx, &info); if (!IS_ERR(desc)) break; if (PTR_ERR(desc) == -EPROBE_DEFER) @@ -941,7 +981,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, /* Then from plain _CRS GPIOs */ if (IS_ERR(desc)) { - if (!acpi_can_fallback_to_crs(adev, con_id)) + if (!adev || !acpi_can_fallback_to_crs(adev, con_id)) return ERR_PTR(-ENOENT); desc = acpi_get_gpiod_by_index(adev, NULL, idx, &info); @@ -979,29 +1019,13 @@ struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, struct acpi_gpio_info *info) { - struct acpi_gpio_lookup lookup; struct acpi_device *adev; - int ret; adev = to_acpi_device_node(fwnode); if (adev) return acpi_get_gpiod_by_index(adev, propname, index, info); - if (!is_acpi_data_node(fwnode)) - return ERR_PTR(-ENODEV); - - if (!propname) - return ERR_PTR(-EINVAL); - - memset(&lookup, 0, sizeof(lookup)); - lookup.index = index; - - ret = acpi_gpio_property_lookup(fwnode, propname, index, &lookup); - if (ret) - return ERR_PTR(ret); - - ret = acpi_gpio_resource_lookup(&lookup, info); - return ret ? ERR_PTR(ret) : lookup.desc; + return acpi_get_gpiod_from_data(fwnode, propname, index, info); } /** -- GitLab From b7452d670fdef8974e18754342fe6f68e20c2567 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 15 Nov 2022 11:20:47 +0100 Subject: [PATCH 0627/1423] gpiolib: acpi: avoid leaking ACPI details into upper gpiolib layers There is no need for the generic parts of GPIOLIB to be aware of implementation details of ACPI-bases lookups. Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 51 +++++++++++++++++++++++++++++++------ drivers/gpio/gpiolib-acpi.h | 46 +++------------------------------ drivers/gpio/gpiolib.c | 8 ++---- 3 files changed, 48 insertions(+), 57 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index bd36fac20ea0d..1d69b707cbb18 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -89,6 +89,30 @@ struct acpi_gpio_chip { struct list_head deferred_req_irqs_list_entry; }; +/** + * struct acpi_gpio_info - ACPI GPIO specific information + * @adev: reference to ACPI device which consumes GPIO resource + * @flags: GPIO initialization flags + * @gpioint: if %true this GPIO is of type GpioInt otherwise type is GpioIo + * @pin_config: pin bias as provided by ACPI + * @polarity: interrupt polarity as provided by ACPI + * @triggering: triggering type as provided by ACPI + * @wake_capable: wake capability as provided by ACPI + * @debounce: debounce timeout as provided by ACPI + * @quirks: Linux specific quirks as provided by struct acpi_gpio_mapping + */ +struct acpi_gpio_info { + struct acpi_device *adev; + enum gpiod_flags flags; + bool gpioint; + int pin_config; + int polarity; + int triggering; + bool wake_capable; + unsigned int debounce; + unsigned int quirks; +}; + /* * For GPIO chips which call acpi_gpiochip_request_interrupts() before late_init * (so builtin drivers) we register the ACPI GpioInt IRQ handlers from a @@ -670,8 +694,8 @@ __acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, enum gpiod_flags update) return ret; } -int -acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, struct acpi_gpio_info *info) +static int acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, + struct acpi_gpio_info *info) { struct device *dev = &info->adev->dev; enum gpiod_flags old = *flags; @@ -690,8 +714,8 @@ acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, struct acpi_gpio_info *inf return ret; } -int acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, - struct acpi_gpio_info *info) +static int acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, + struct acpi_gpio_info *info) { switch (info->pin_config) { case ACPI_PIN_CONFIG_PULLUP: @@ -1005,7 +1029,8 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, * @fwnode: pointer to an ACPI firmware node to get the GPIO information from * @propname: Property name of the GPIO * @index: index of GpioIo/GpioInt resource (starting from %0) - * @info: info pointer to fill in (optional) + * @lflags: bitmask of gpio_lookup_flags GPIO_* values + * @dflags: gpiod initialization flags * * If @fwnode is an ACPI device object, call acpi_get_gpiod_by_index() for it. * Otherwise (i.e. it is a data-only non-device object), use the property-based @@ -1017,15 +1042,25 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, */ struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, - struct acpi_gpio_info *info) + unsigned long *lflags, + enum gpiod_flags *dflags) { + struct acpi_gpio_info info; struct acpi_device *adev; + struct gpio_desc *desc; adev = to_acpi_device_node(fwnode); if (adev) - return acpi_get_gpiod_by_index(adev, propname, index, info); + desc = acpi_get_gpiod_by_index(adev, propname, index, &info); + else + desc = acpi_get_gpiod_from_data(fwnode, propname, index, &info); - return acpi_get_gpiod_from_data(fwnode, propname, index, info); + if (!IS_ERR(desc)) { + acpi_gpio_update_gpiod_flags(dflags, &info); + acpi_gpio_update_gpiod_lookup_flags(lflags, &info); + } + + return desc; } /** diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index 9fc34830639ce..42adaab518c5b 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -10,30 +10,6 @@ struct acpi_device; -/** - * struct acpi_gpio_info - ACPI GPIO specific information - * @adev: reference to ACPI device which consumes GPIO resource - * @flags: GPIO initialization flags - * @gpioint: if %true this GPIO is of type GpioInt otherwise type is GpioIo - * @pin_config: pin bias as provided by ACPI - * @polarity: interrupt polarity as provided by ACPI - * @triggering: triggering type as provided by ACPI - * @wake_capable: wake capability as provided by ACPI - * @debounce: debounce timeout as provided by ACPI - * @quirks: Linux specific quirks as provided by struct acpi_gpio_mapping - */ -struct acpi_gpio_info { - struct acpi_device *adev; - enum gpiod_flags flags; - bool gpioint; - int pin_config; - int polarity; - int triggering; - bool wake_capable; - unsigned int debounce; - unsigned int quirks; -}; - #ifdef CONFIG_ACPI void acpi_gpiochip_add(struct gpio_chip *chip); void acpi_gpiochip_remove(struct gpio_chip *chip); @@ -43,11 +19,6 @@ void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev); void acpi_gpiochip_request_interrupts(struct gpio_chip *chip); void acpi_gpiochip_free_interrupts(struct gpio_chip *chip); -int acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, - struct acpi_gpio_info *info); -int acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, - struct acpi_gpio_info *info); - struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, @@ -55,7 +26,8 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, unsigned long *lookupflags); struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, - struct acpi_gpio_info *info); + unsigned long *lflags, + enum gpiod_flags *dflags); int acpi_gpio_count(struct device *dev, const char *con_id); #else @@ -70,18 +42,6 @@ acpi_gpiochip_request_interrupts(struct gpio_chip *chip) { } static inline void acpi_gpiochip_free_interrupts(struct gpio_chip *chip) { } -static inline int -acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, struct acpi_gpio_info *info) -{ - return 0; -} -static inline int -acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, - struct acpi_gpio_info *info) -{ - return 0; -} - static inline struct gpio_desc * acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags *dflags, @@ -91,7 +51,7 @@ acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, } static inline struct gpio_desc * acpi_node_get_gpiod(struct fwnode_handle *fwnode, const char *propname, - int index, struct acpi_gpio_info *info) + int index, unsigned long *lflags, enum gpiod_flags *dflags) { return ERR_PTR(-ENXIO); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index e874bb0ef685e..b213f5f93ae09 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3890,14 +3890,10 @@ static struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, label); return desc; } else if (is_acpi_node(fwnode)) { - struct acpi_gpio_info info; - - desc = acpi_node_get_gpiod(fwnode, propname, index, &info); + desc = acpi_node_get_gpiod(fwnode, propname, index, + &lflags, &dflags); if (IS_ERR(desc)) return desc; - - acpi_gpio_update_gpiod_flags(&dflags, &info); - acpi_gpio_update_gpiod_lookup_flags(&lflags, &info); } else { return ERR_PTR(-EINVAL); } -- GitLab From 8eb1f71e7acca4f92cf9cf83030cbb8ec2524025 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 11 Nov 2022 14:19:07 -0800 Subject: [PATCH 0628/1423] gpiolib: consolidate GPIO lookups Ensure that all paths to obtain/look up GPIOD from generic consumer-visible APIs go through the new gpiod_find_and_request() helper, so that we can easily extend it with support for new firmware mechanisms. The only exception is OF-specific [devm_]gpiod_get_from_of_node() API that is still being used by a couple of drivers and will be removed as soon as patches converting them to use generic fwnode/device APIs are accepted. Acked-by: Linus Walleij Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 39 ------- drivers/gpio/gpiolib-acpi.h | 10 -- drivers/gpio/gpiolib.c | 204 ++++++++++++++---------------------- 3 files changed, 76 insertions(+), 177 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 1d69b707cbb18..c99c94e5483f6 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1024,45 +1024,6 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, return desc; } -/** - * acpi_node_get_gpiod() - get a GPIO descriptor from ACPI resources - * @fwnode: pointer to an ACPI firmware node to get the GPIO information from - * @propname: Property name of the GPIO - * @index: index of GpioIo/GpioInt resource (starting from %0) - * @lflags: bitmask of gpio_lookup_flags GPIO_* values - * @dflags: gpiod initialization flags - * - * If @fwnode is an ACPI device object, call acpi_get_gpiod_by_index() for it. - * Otherwise (i.e. it is a data-only non-device object), use the property-based - * GPIO lookup to get to the GPIO resource with the relevant information and use - * that to obtain the GPIO descriptor to return. - * - * If the GPIO cannot be translated or there is an error an ERR_PTR is - * returned. - */ -struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - unsigned long *lflags, - enum gpiod_flags *dflags) -{ - struct acpi_gpio_info info; - struct acpi_device *adev; - struct gpio_desc *desc; - - adev = to_acpi_device_node(fwnode); - if (adev) - desc = acpi_get_gpiod_by_index(adev, propname, index, &info); - else - desc = acpi_get_gpiod_from_data(fwnode, propname, index, &info); - - if (!IS_ERR(desc)) { - acpi_gpio_update_gpiod_flags(dflags, &info); - acpi_gpio_update_gpiod_lookup_flags(lflags, &info); - } - - return desc; -} - /** * acpi_dev_gpio_irq_wake_get_by() - Find GpioInt and translate it to Linux IRQ number * @adev: pointer to a ACPI device to get IRQ from diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index 42adaab518c5b..d2b5dab5c5bff 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -24,10 +24,6 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, unsigned int idx, enum gpiod_flags *dflags, unsigned long *lookupflags); -struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - unsigned long *lflags, - enum gpiod_flags *dflags); int acpi_gpio_count(struct device *dev, const char *con_id); #else @@ -49,12 +45,6 @@ acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, { return ERR_PTR(-ENOENT); } -static inline struct gpio_desc * -acpi_node_get_gpiod(struct fwnode_handle *fwnode, const char *propname, - int index, unsigned long *lflags, enum gpiod_flags *dflags) -{ - return ERR_PTR(-ENXIO); -} static inline int acpi_gpio_count(struct device *dev, const char *con_id) { return -ENODEV; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index b213f5f93ae09..7f739096c4cf7 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -366,7 +366,7 @@ static int gpiochip_set_desc_names(struct gpio_chip *gc) static int devprop_gpiochip_set_names(struct gpio_chip *chip) { struct gpio_device *gdev = chip->gpiodev; - struct fwnode_handle *fwnode = dev_fwnode(&gdev->dev); + const struct fwnode_handle *fwnode = dev_fwnode(&gdev->dev); const char **names; int ret, i; int count; @@ -3853,58 +3853,84 @@ static int platform_gpio_count(struct device *dev, const char *con_id) return count; } -/** - * fwnode_get_named_gpiod - obtain a GPIO from firmware node - * @fwnode: handle of the firmware node - * @propname: name of the firmware property representing the GPIO - * @index: index of the GPIO to obtain for the consumer - * @dflags: GPIO initialization flags - * @label: label to attach to the requested GPIO - * - * This function can be used for drivers that get their configuration - * from opaque firmware. - * - * The function properly finds the corresponding GPIO using whatever is the - * underlying firmware interface and then makes sure that the GPIO - * descriptor is requested before it is returned to the caller. - * - * Returns: - * On successful request the GPIO pin is configured in accordance with - * provided @dflags. - * - * In case of error an ERR_PTR() is returned. - */ -static struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, - const char *propname, int index, - enum gpiod_flags dflags, - const char *label) +static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode, + struct device *consumer, + const char *con_id, + unsigned int idx, + enum gpiod_flags *flags, + unsigned long *lookupflags) { - unsigned long lflags = GPIO_LOOKUP_FLAGS_DEFAULT; - struct gpio_desc *desc = ERR_PTR(-ENODEV); - int ret; + struct gpio_desc *desc = ERR_PTR(-ENOENT); if (is_of_node(fwnode)) { - desc = gpiod_get_from_of_node(to_of_node(fwnode), - propname, index, - dflags, - label); - return desc; + dev_dbg(consumer, "using DT '%pfw' for '%s' GPIO lookup\n", + fwnode, con_id); + desc = of_find_gpio(to_of_node(fwnode), con_id, idx, lookupflags); } else if (is_acpi_node(fwnode)) { - desc = acpi_node_get_gpiod(fwnode, propname, index, - &lflags, &dflags); - if (IS_ERR(desc)) - return desc; - } else { - return ERR_PTR(-EINVAL); + dev_dbg(consumer, "using ACPI '%pfw' for '%s' GPIO lookup\n", + fwnode, con_id); + desc = acpi_find_gpio(fwnode, con_id, idx, flags, lookupflags); } - /* Currently only ACPI takes this path */ + return desc; +} + +static struct gpio_desc *gpiod_find_and_request(struct device *consumer, + struct fwnode_handle *fwnode, + const char *con_id, + unsigned int idx, + enum gpiod_flags flags, + const char *label, + bool platform_lookup_allowed) +{ + struct gpio_desc *desc = ERR_PTR(-ENOENT); + unsigned long lookupflags; + int ret; + + if (!IS_ERR_OR_NULL(fwnode)) + desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, + &flags, &lookupflags); + + if (gpiod_not_found(desc) && platform_lookup_allowed) { + /* + * Either we are not using DT or ACPI, or their lookup did not + * return a result. In that case, use platform lookup as a + * fallback. + */ + dev_dbg(consumer, "using lookup tables for GPIO lookup\n"); + desc = gpiod_find(consumer, con_id, idx, &lookupflags); + } + + if (IS_ERR(desc)) { + dev_dbg(consumer, "No GPIO consumer %s found\n", con_id); + return desc; + } + + /* + * If a connection label was passed use that, else attempt to use + * the device name as label + */ ret = gpiod_request(desc, label); - if (ret) - return ERR_PTR(ret); + if (ret) { + if (!(ret == -EBUSY && flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE)) + return ERR_PTR(ret); + + /* + * This happens when there are several consumers for + * the same GPIO line: we just return here without + * further initialization. It is a bit of a hack. + * This is necessary to support fixed regulators. + * + * FIXME: Make this more sane and safe. + */ + dev_info(consumer, + "nonexclusive access to GPIO for %s\n", con_id); + return desc; + } - ret = gpiod_configure_flags(desc, propname, lflags, dflags); + ret = gpiod_configure_flags(desc, con_id, lookupflags, flags); if (ret < 0) { + dev_dbg(consumer, "setup of GPIO %s failed\n", con_id); gpiod_put(desc); return ERR_PTR(ret); } @@ -3937,29 +3963,12 @@ static struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, * In case of error an ERR_PTR() is returned. */ struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, - const char *con_id, int index, + const char *con_id, + int index, enum gpiod_flags flags, const char *label) { - struct gpio_desc *desc; - char prop_name[32]; /* 32 is max size of property name */ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) { - if (con_id) - snprintf(prop_name, sizeof(prop_name), "%s-%s", - con_id, gpio_suffixes[i]); - else - snprintf(prop_name, sizeof(prop_name), "%s", - gpio_suffixes[i]); - - desc = fwnode_get_named_gpiod(fwnode, prop_name, index, flags, - label); - if (!gpiod_not_found(desc)) - break; - } - - return desc; + return gpiod_find_and_request(NULL, fwnode, con_id, index, flags, label, false); } EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index); @@ -4113,72 +4122,11 @@ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, unsigned int idx, enum gpiod_flags flags) { - unsigned long lookupflags = GPIO_LOOKUP_FLAGS_DEFAULT; - struct gpio_desc *desc = NULL; - int ret; - /* Maybe we have a device name, maybe not */ - const char *devname = dev ? dev_name(dev) : "?"; struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; + const char *devname = dev ? dev_name(dev) : "?"; + const char *label = con_id ?: devname; - dev_dbg(dev, "GPIO lookup for consumer %s\n", con_id); - - /* Using device tree? */ - if (is_of_node(fwnode)) { - dev_dbg(dev, "using device tree for GPIO lookup\n"); - desc = of_find_gpio(to_of_node(fwnode), - con_id, idx, &lookupflags); - } else if (is_acpi_node(fwnode)) { - dev_dbg(dev, "using ACPI for GPIO lookup\n"); - desc = acpi_find_gpio(fwnode, - con_id, idx, &flags, &lookupflags); - } - - /* - * Either we are not using DT or ACPI, or their lookup did not return - * a result. In that case, use platform lookup as a fallback. - */ - if (!desc || gpiod_not_found(desc)) { - dev_dbg(dev, "using lookup tables for GPIO lookup\n"); - desc = gpiod_find(dev, con_id, idx, &lookupflags); - } - - if (IS_ERR(desc)) { - dev_dbg(dev, "No GPIO consumer %s found\n", con_id); - return desc; - } - - /* - * If a connection label was passed use that, else attempt to use - * the device name as label - */ - ret = gpiod_request(desc, con_id ?: devname); - if (ret) { - if (!(ret == -EBUSY && flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE)) - return ERR_PTR(ret); - - /* - * This happens when there are several consumers for - * the same GPIO line: we just return here without - * further initialization. It is a bit of a hack. - * This is necessary to support fixed regulators. - * - * FIXME: Make this more sane and safe. - */ - dev_info(dev, "nonexclusive access to GPIO for %s\n", con_id ?: devname); - return desc; - } - - ret = gpiod_configure_flags(desc, con_id, lookupflags, flags); - if (ret < 0) { - dev_dbg(dev, "setup of GPIO %s failed\n", con_id); - gpiod_put(desc); - return ERR_PTR(ret); - } - - blocking_notifier_call_chain(&desc->gdev->notifier, - GPIOLINE_CHANGED_REQUESTED, desc); - - return desc; + return gpiod_find_and_request(dev, fwnode, con_id, idx, flags, label, true); } EXPORT_SYMBOL_GPL(gpiod_get_index); -- GitLab From e7f9ff5dc90c3826231343439c35c6b7e9e57378 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 11 Nov 2022 14:19:08 -0800 Subject: [PATCH 0629/1423] gpiolib: add support for software nodes Now that static device properties understand notion of child nodes and references, let's teach gpiolib to handle them: - GPIOs are represented as a references to software nodes representing gpiochip - references must have 2 arguments - GPIO number within the chip and GPIO flags (GPIO_ACTIVE_LOW/GPIO_ACTIVE_HIGH, etc) - a new PROPERTY_ENTRY_GPIO() macro is supplied to ensure the above - name of the software node representing gpiochip must match label of the gpiochip, as we use it to locate gpiochip structure at runtime The following illustrates use of software nodes to describe a "System" button that is currently specified via use of gpio_keys_platform_data in arch/mips/alchemy/board-mtx1.c. It follows bindings specified in Documentation/devicetree/bindings/input/gpio-keys.yaml. static const struct software_node mxt1_gpiochip2_node = { .name = "alchemy-gpio2", }; static const struct property_entry mtx1_gpio_button_props[] = { PROPERTY_ENTRY_U32("linux,code", BTN_0), PROPERTY_ENTRY_STRING("label", "System button"), PROPERTY_ENTRY_GPIO("gpios", &mxt1_gpiochip2_node, 7, GPIO_ACTIVE_LOW), { } }; Similarly, arch/arm/mach-tegra/board-paz00.c can be converted to: static const struct software_node tegra_gpiochip_node = { .name = "tegra-gpio", }; static struct property_entry wifi_rfkill_prop[] __initdata = { PROPERTY_ENTRY_STRING("name", "wifi_rfkill"), PROPERTY_ENTRY_STRING("type", "wlan"), PROPERTY_ENTRY_GPIO("reset-gpios", &tegra_gpiochip_node, 25, GPIO_ACTIVE_HIGH); PROPERTY_ENTRY_GPIO("shutdown-gpios", &tegra_gpiochip_node, 85, GPIO_ACTIVE_HIGH); { }, }; static struct platform_device wifi_rfkill_device = { .name = "rfkill_gpio", .id = -1, }; ... software_node_register(&tegra_gpiochip_node); device_create_managed_software_node(&wifi_rfkill_device.dev, wifi_rfkill_prop, NULL); Acked-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Makefile | 1 + drivers/gpio/gpiolib-swnode.c | 123 ++++++++++++++++++++++++++++++++++ drivers/gpio/gpiolib-swnode.h | 14 ++++ drivers/gpio/gpiolib.c | 7 ++ include/linux/gpio/property.h | 11 +++ 5 files changed, 156 insertions(+) create mode 100644 drivers/gpio/gpiolib-swnode.c create mode 100644 drivers/gpio/gpiolib-swnode.h create mode 100644 include/linux/gpio/property.h diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 8629e9eaf79e1..010587025fc87 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_OF_GPIO) += gpiolib-of.o obj-$(CONFIG_GPIO_CDEV) += gpiolib-cdev.o obj-$(CONFIG_GPIO_SYSFS) += gpiolib-sysfs.o obj-$(CONFIG_GPIO_ACPI) += gpiolib-acpi.o +obj-$(CONFIG_GPIOLIB) += gpiolib-swnode.o # Device drivers. Generally keep list sorted alphabetically obj-$(CONFIG_GPIO_REGMAP) += gpio-regmap.o diff --git a/drivers/gpio/gpiolib-swnode.c b/drivers/gpio/gpiolib-swnode.c new file mode 100644 index 0000000000000..dd9ccac214d19 --- /dev/null +++ b/drivers/gpio/gpiolib-swnode.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Software Node helpers for the GPIO API + * + * Copyright 2022 Google LLC + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gpiolib.h" +#include "gpiolib-swnode.h" + +static void swnode_format_propname(const char *con_id, char *propname, + size_t max_size) +{ + /* + * Note we do not need to try both -gpios and -gpio suffixes, + * as, unlike OF and ACPI, we can fix software nodes to conform + * to the proper binding. + */ + if (con_id) + snprintf(propname, max_size, "%s-gpios", con_id); + else + strscpy(propname, "gpios", max_size); +} + +static int swnode_gpiochip_match_name(struct gpio_chip *chip, void *data) +{ + return !strcmp(chip->label, data); +} + +static struct gpio_chip *swnode_get_chip(struct fwnode_handle *fwnode) +{ + const struct software_node *chip_node; + struct gpio_chip *chip; + + chip_node = to_software_node(fwnode); + if (!chip_node || !chip_node->name) + return ERR_PTR(-EINVAL); + + chip = gpiochip_find((void *)chip_node->name, swnode_gpiochip_match_name); + return chip ?: ERR_PTR(-EPROBE_DEFER); +} + +struct gpio_desc *swnode_find_gpio(struct fwnode_handle *fwnode, + const char *con_id, unsigned int idx, + unsigned long *flags) +{ + const struct software_node *swnode; + struct fwnode_reference_args args; + struct gpio_chip *chip; + struct gpio_desc *desc; + char propname[32]; /* 32 is max size of property name */ + int error; + + swnode = to_software_node(fwnode); + if (!swnode) + return ERR_PTR(-EINVAL); + + swnode_format_propname(con_id, propname, sizeof(propname)); + + /* + * We expect all swnode-described GPIOs have GPIO number and + * polarity arguments, hence nargs is set to 2. + */ + error = fwnode_property_get_reference_args(fwnode, propname, NULL, 2, idx, &args); + if (error) { + pr_debug("%s: can't parse '%s' property of node '%pfwP[%d]'\n", + __func__, propname, fwnode, idx); + return ERR_PTR(error); + } + + chip = swnode_get_chip(args.fwnode); + fwnode_handle_put(args.fwnode); + if (IS_ERR(chip)) + return ERR_CAST(chip); + + desc = gpiochip_get_desc(chip, args.args[0]); + *flags = args.args[1]; /* We expect native GPIO flags */ + + pr_debug("%s: parsed '%s' property of node '%pfwP[%d]' - status (%d)\n", + __func__, propname, fwnode, idx, PTR_ERR_OR_ZERO(desc)); + + return desc; +} + +/** + * swnode_gpio_count - count the GPIOs associated with a device / function + * @fwnode: firmware node of the GPIO consumer, can be %NULL for + * system-global GPIOs + * @con_id: function within the GPIO consumer + * + * Return: + * The number of GPIOs associated with a device / function or %-ENOENT, + * if no GPIO has been assigned to the requested function. + */ +int swnode_gpio_count(const struct fwnode_handle *fwnode, const char *con_id) +{ + struct fwnode_reference_args args; + char propname[32]; + int count; + + swnode_format_propname(con_id, propname, sizeof(propname)); + + /* + * This is not very efficient, but GPIO lists usually have only + * 1 or 2 entries. + */ + count = 0; + while (fwnode_property_get_reference_args(fwnode, propname, NULL, 0, + count, &args) == 0) { + fwnode_handle_put(args.fwnode); + count++; + } + + return count ?: -ENOENT; +} diff --git a/drivers/gpio/gpiolib-swnode.h b/drivers/gpio/gpiolib-swnode.h new file mode 100644 index 0000000000000..af849e56f6bce --- /dev/null +++ b/drivers/gpio/gpiolib-swnode.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef GPIOLIB_SWNODE_H +#define GPIOLIB_SWNODE_H + +struct fwnode_handle; +struct gpio_desc; + +struct gpio_desc *swnode_find_gpio(struct fwnode_handle *fwnode, + const char *con_id, unsigned int idx, + unsigned long *flags); +int swnode_gpio_count(const struct fwnode_handle *fwnode, const char *con_id); + +#endif /* GPIOLIB_SWNODE_H */ diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 7f739096c4cf7..7936d54a2e302 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -26,6 +26,7 @@ #include "gpiolib.h" #include "gpiolib-of.h" #include "gpiolib-acpi.h" +#include "gpiolib-swnode.h" #include "gpiolib-cdev.h" #include "gpiolib-sysfs.h" @@ -3870,6 +3871,10 @@ static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode, dev_dbg(consumer, "using ACPI '%pfw' for '%s' GPIO lookup\n", fwnode, con_id); desc = acpi_find_gpio(fwnode, con_id, idx, flags, lookupflags); + } else if (is_software_node(fwnode)) { + dev_dbg(consumer, "using swnode '%pfw' for '%s' GPIO lookup\n", + fwnode, con_id); + desc = swnode_find_gpio(fwnode, con_id, idx, lookupflags); } return desc; @@ -3987,6 +3992,8 @@ int gpiod_count(struct device *dev, const char *con_id) count = of_gpio_get_count(dev, con_id); else if (is_acpi_node(fwnode)) count = acpi_gpio_count(dev, con_id); + else if (is_software_node(fwnode)) + count = swnode_gpio_count(fwnode, con_id); if (count < 0) count = platform_gpio_count(dev, con_id); diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h new file mode 100644 index 0000000000000..6c75c8bd44a0b --- /dev/null +++ b/include/linux/gpio/property.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ +#ifndef __LINUX_GPIO_PROPERTY_H +#define __LINUX_GPIO_PROPERTY_H + +#include /* for GPIO_* flags */ +#include + +#define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ + PROPERTY_ENTRY_REF(_name_, _chip_node_, _idx_, _flags_) + +#endif /* __LINUX_GPIO_PROPERTY_H */ -- GitLab From 77289b2f5aa3535a2e49b448c6afb36f5526016a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 14 Nov 2022 20:46:25 +0200 Subject: [PATCH 0630/1423] gpiolib: of: Prepare of_mm_gpiochip_add_data() for fwnode GPIO library is getting rid of of_node, fwnode should be utilized instead. Prepare of_mm_gpiochip_add_data() for fwnode. Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 596b8e21700ef..c9b0c9fdeca8a 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -934,8 +934,8 @@ int of_mm_gpiochip_add_data(struct device_node *np, if (mm_gc->save_regs) mm_gc->save_regs(mm_gc); - of_node_put(mm_gc->gc.of_node); - mm_gc->gc.of_node = of_node_get(np); + fwnode_handle_put(mm_gc->gc.fwnode); + mm_gc->gc.fwnode = fwnode_handle_get(of_fwnode_handle(np)); ret = gpiochip_add_data(gc, data); if (ret) -- GitLab From a431803852de00d8d3c143b19f5690254225538f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 14 Nov 2022 20:46:26 +0200 Subject: [PATCH 0631/1423] gpiolib: of: Drop redundant check in of_mm_gpiochip_remove() The callers never call the function with invalid pointer. Moreover, compiler quite likely dropped that check anyway because we use that pointer before the check. Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-by: Dmitry Torokhov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index c9b0c9fdeca8a..a6871859a59de 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -961,9 +961,6 @@ void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc) { struct gpio_chip *gc = &mm_gc->gc; - if (!mm_gc) - return; - gpiochip_remove(gc); iounmap(mm_gc->regs); kfree(gc->label); -- GitLab From ddf07bd874be791a63fca5ac0e3def1e15f2338f Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 11 Nov 2022 12:37:32 +0100 Subject: [PATCH 0632/1423] gpiolib: of: Use correct fwnode for DT-probed chips The OF node store in chip->fwnode is used to explicitly override the FW node for a GPIO chip. For chips that use the default FW node (i.e. that of their parent device), this will be NULL and cause the chip not to be fully registered. Instead, use the GPIO device's FW node, which is set to either the node of the parent device or the explicit override in chip->fwnode. Fixes: 8afe82550240 ("gpiolib: of: Prepare of_gpiochip_add() / of_gpiochip_remove() for fwnode") Tested-by: Marek Szyprowski Signed-off-by: Thierry Reding Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Tested-by: Robert Marko Tested-by: Andrew Halaney Reviewed-by: Brian Masney Tested-by: Brian Masney Tested-by: Geert Uytterhoeven Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index a6871859a59de..4fff7258ee41a 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -1063,7 +1063,7 @@ int of_gpiochip_add(struct gpio_chip *chip) struct device_node *np; int ret; - np = to_of_node(chip->fwnode); + np = to_of_node(dev_fwnode(&chip->gpiodev->dev)); if (!np) return 0; -- GitLab From 739be9b6a84b23c40b0fb534b749602fb8285e70 Mon Sep 17 00:00:00 2001 From: Aidan MacDonald Date: Sat, 12 Nov 2022 15:29:28 +0000 Subject: [PATCH 0633/1423] gpio: sl28cpld: Replace irqchip mask_invert with unmask_base Remove use of the deprecated mask_invert flag. Inverted mask registers (where a '1' bit enables an IRQ) can be described more directly as an unmask register. Signed-off-by: Aidan MacDonald Acked-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sl28cpld.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-sl28cpld.c b/drivers/gpio/gpio-sl28cpld.c index 52404736ac861..2195f88c20485 100644 --- a/drivers/gpio/gpio-sl28cpld.c +++ b/drivers/gpio/gpio-sl28cpld.c @@ -70,8 +70,7 @@ static int sl28cpld_gpio_irq_init(struct platform_device *pdev, irq_chip->num_irqs = ARRAY_SIZE(sl28cpld_gpio_irqs); irq_chip->num_regs = 1; irq_chip->status_base = base + GPIO_REG_IP; - irq_chip->mask_base = base + GPIO_REG_IE; - irq_chip->mask_invert = true; + irq_chip->unmask_base = base + GPIO_REG_IE; irq_chip->ack_base = base + GPIO_REG_IP; ret = devm_regmap_add_irq_chip_fwnode(dev, dev_fwnode(dev), -- GitLab From e67ad9354a9b7621341adec4ac2c63d5269f835d Mon Sep 17 00:00:00 2001 From: Albert Zhou Date: Tue, 15 Nov 2022 22:38:56 +1100 Subject: [PATCH 0634/1423] PCI: pciehp: Enable by default if USB4 enabled Thunderbolt/USB4 PCIe tunneling depends on native PCIe hotplug. Enable pciehp by default if USB4 is enabled. [bhelgaas: squash, update subject, commit logs, tidy whitespace] Link: https://lore.kernel.org/r/20221115113857.35800-2-albert.zhou.50@gmail.com Link: https://lore.kernel.org/r/20221115113857.35800-3-albert.zhou.50@gmail.com Signed-off-by: Albert Zhou Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/hotplug/Kconfig | 3 +++ drivers/pci/pcie/Kconfig | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/Kconfig b/drivers/pci/hotplug/Kconfig index 840a84bb5ee26..48113b210cf93 100644 --- a/drivers/pci/hotplug/Kconfig +++ b/drivers/pci/hotplug/Kconfig @@ -6,11 +6,14 @@ menuconfig HOTPLUG_PCI bool "Support for PCI Hotplug" depends on PCI && SYSFS + default y if USB4 help Say Y here if you have a motherboard with a PCI Hotplug controller. This allows you to add and remove PCI cards while the machine is powered up and running. + Thunderbolt/USB4 PCIe tunneling depends on native PCIe hotplug. + When in doubt, say N. if HOTPLUG_PCI diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 788ac8df3f9db..228652a59f27e 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -4,6 +4,7 @@ # config PCIEPORTBUS bool "PCI Express Port Bus support" + default y if USB4 help This enables PCI Express Port Bus support. Users can then enable support for Native Hot-Plug, Advanced Error Reporting, Power @@ -15,9 +16,12 @@ config PCIEPORTBUS config HOTPLUG_PCI_PCIE bool "PCI Express Hotplug driver" depends on HOTPLUG_PCI && PCIEPORTBUS + default y if USB4 help - Say Y here if you have a motherboard that supports PCI Express Native - Hotplug + Say Y here if you have a motherboard that supports PCIe native + hotplug. + + Thunderbolt/USB4 PCIe tunneling depends on native PCIe hotplug. When in doubt, say N. -- GitLab From 60da2d11fcbc043304910e4d2ca82f9bab953e63 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Tue, 15 Nov 2022 18:07:47 +0100 Subject: [PATCH 0635/1423] RDMA/siw: Set defined status for work completion with undefined status A malicious user may write undefined values into memory mapped completion queue elements status or opcode. Undefined status or opcode values will result in out-of-bounds access to an array mapping siw internal representation of opcode and status to RDMA core representation when reaping CQ elements. While siw detects those undefined values, it did not correctly set completion status to a defined value, thus defeating the whole purpose of the check. This bug leads to the following Smatch static checker warning: drivers/infiniband/sw/siw/siw_cq.c:96 siw_reap_cqe() error: buffer overflow 'map_cqe_status' 10 <= 21 Fixes: bdf1da5df9da ("RDMA/siw: Fix immediate work request flush to completion queue") Link: https://lore.kernel.org/r/20221115170747.1263298-1-bmt@zurich.ibm.com Reported-by: Dan Carpenter Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index acc7bcd538b53..403029de6b92d 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -88,9 +88,9 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) if (opcode >= SIW_NUM_OPCODES) { opcode = 0; - status = IB_WC_GENERAL_ERR; + status = SIW_WC_GENERAL_ERR; } else if (status >= SIW_NUM_WC_STATUS) { - status = IB_WC_GENERAL_ERR; + status = SIW_WC_GENERAL_ERR; } wc->opcode = map_wc_opcode[opcode]; wc->status = map_cqe_status[status].ib; -- GitLab From 2d08a893b87cf9b2f9dbb3afaff60ca4530d55a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Nov 2022 20:17:07 +0000 Subject: [PATCH 0636/1423] x86/debug: Include percpu.h in debugreg.h to get DECLARE_PER_CPU() et al MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include percpu.h to pick up the definition of DECLARE_PER_CPU() and friends instead of relying on the parent to provide the #include. E.g. swapping the order of includes in arch/x86/kvm/vmx/nested.c (simulating KVM code movement being done for other purposes) results in build errors: In file included from arch/x86/kvm/vmx/nested.c:3: arch/x86/include/asm/debugreg.h:9:32: error: unknown type name ‘cpu_dr7â€=99 9 | DECLARE_PER_CPU(unsigned long, cpu_dr7); | ^~~~~~~ Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221110201707.1976032-1-seanjc@google.com --- arch/x86/include/asm/debugreg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index cfdf307ddc012..b049d950612fd 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,8 +2,8 @@ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H - #include +#include #include DECLARE_PER_CPU(unsigned long, cpu_dr7); -- GitLab From 24c94060fc9b4e0f19e6e018869db46db21d6bc7 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Mon, 14 Nov 2022 15:29:43 -0500 Subject: [PATCH 0637/1423] gpiolib: ensure that fwnode is properly set Note that this is a RFC patch and not meant to be merged. I looked into a problem with linux-next-20221110 on the Qualcomm SA8540P automotive board (sc8280xp) where the UFS host controller would fail to probe due to repeated probe deferrals when trying to get reset-gpios via devm_gpiod_get_optional(). of_get_named_gpiod_flags() returns -EPROBE_DEFER, which is caused by of_gpiochip_match_node_and_xlate() returning 0 since the of_xlate function pointer is not set for the qcom,sc8280xp-tlmm pinctrl driver. The pinctrl driver doesn't define one, so of_gpiochip_add() should automatically setup of_gpio_simple_xlate() on it's behalf. This doesn't happen since the fwnode member on the struct gpiochip is set to null when of_gpiochip_add() is called. Let's work around this by ensuring that it's set if available. Note that this broke sometime within the last few weeks within linux-next and I haven't bisected this. I'm posting this in the hopes that someone may know offhand which patch(es) may have broken this. Signed-off-by: Brian Masney Tested-by: Marijn Suijten Tested-by: Konrad Dybcio Tested-by: Steev Klimaszewski #Lenovo Thinkpad X13s Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 7936d54a2e302..51afdc6ac9198 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -679,7 +679,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * Assign fwnode depending on the result of the previous calls, * if none of them succeed, assign it to the parent's one. */ - gdev->dev.fwnode = dev_fwnode(&gdev->dev) ?: fwnode; + gc->fwnode = gdev->dev.fwnode = dev_fwnode(&gdev->dev) ?: fwnode; gdev->id = ida_alloc(&gpio_ida, GFP_KERNEL); if (gdev->id < 0) { -- GitLab From 40059212f99c31f26c69763e560325e59eac02c6 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Tue, 15 Nov 2022 17:10:21 -0600 Subject: [PATCH 0638/1423] dt-bindings: gpio: gpio-davinci: Increase maxItems in gpio-line-names gpio-line-names really depends on ti,ngpios. However, the maximum value we have seen across the board is on K2G and da850 platforms where it can be upto 144. Link: https://lore.kernel.org/linux-arm-kernel/20221115200357.qa2rvw3clbz7unzq@symptom/T/#u Fixes: c830b87a761b ("dt-bindings: gpio: gpio-davinci: Convert to json-schema") Reported-by: Robert Nelson Signed-off-by: Nishanth Menon Acked-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/gpio-davinci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml b/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml index f32e09ef937c0..10e56cf306dba 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml @@ -35,7 +35,7 @@ properties: gpio-line-names: description: strings describing the names of each gpio line. minItems: 1 - maxItems: 100 + maxItems: 144 "#gpio-cells": const: 2 -- GitLab From c5c4f72ad4faab641cb852fdd890e8a64cb39f24 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:13 -0700 Subject: [PATCH 0639/1423] KVM: selftests: Add missing break between -e and -g option in dirty_log_perf_test Passing -e option (Run VCPUs while dirty logging is being disabled) in dirty_log_perf_test also unintentionally enables -g (Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2). Add break between two switch case logic. Fixes: cfe12e64b065 ("KVM: selftests: Add an option to run vCPUs while disabling dirty logging") Signed-off-by: Vipin Sharma Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-2-vipinsh@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index f99e39a672d3f..56e08da3a87f4 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -411,6 +411,7 @@ int main(int argc, char *argv[]) case 'e': /* 'e' is for evil. */ run_vcpus_while_disabling_dirty_logging = true; + break; case 'g': dirty_log_manual_caps = 0; break; -- GitLab From 0eb88a4121861ce3d5f925a183abb13ad954dbe6 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:14 -0700 Subject: [PATCH 0640/1423] KVM: selftests: Put command line options in alphabetical order in dirty_log_perf_test There are 13 command line options and they are not in any order. Put them in alphabetical order to make it easy to add new options. No functional change intended. Signed-off-by: Vipin Sharma Reviewed-by: Wei Wang Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-3-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/dirty_log_perf_test.c | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 56e08da3a87f4..5bb6954b23587 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -406,50 +406,52 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "eghi:p:m:nb:f:v:os:x:")) != -1) { + while ((opt = getopt(argc, argv, "b:ef:ghi:m:nop:s:v:x:")) != -1) { switch (opt) { + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; case 'e': /* 'e' is for evil. */ run_vcpus_while_disabling_dirty_logging = true; break; + case 'f': + p.wr_fract = atoi(optarg); + TEST_ASSERT(p.wr_fract >= 1, + "Write fraction cannot be less than one"); + break; case 'g': dirty_log_manual_caps = 0; break; + case 'h': + help(argv[0]); + break; case 'i': p.iterations = atoi(optarg); break; - case 'p': - p.phys_offset = strtoull(optarg, NULL, 0); - break; case 'm': guest_modes_cmdline(optarg); break; case 'n': perf_test_args.nested = true; break; - case 'b': - guest_percpu_mem_size = parse_size(optarg); + case 'o': + p.partition_vcpu_memory_access = false; break; - case 'f': - p.wr_fract = atoi(optarg); - TEST_ASSERT(p.wr_fract >= 1, - "Write fraction cannot be less than one"); + case 'p': + p.phys_offset = strtoull(optarg, NULL, 0); + break; + case 's': + p.backing_src = parse_backing_src_type(optarg); break; case 'v': nr_vcpus = atoi(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; - case 'o': - p.partition_vcpu_memory_access = false; - break; - case 's': - p.backing_src = parse_backing_src_type(optarg); - break; case 'x': p.slots = atoi(optarg); break; - case 'h': default: help(argv[0]); break; -- GitLab From 018ea2d71a43372cb984021f03514dc6dd3d46df Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:15 -0700 Subject: [PATCH 0641/1423] KVM: selftests: Add atoi_paranoid() to catch errors missed by atoi() atoi() doesn't detect errors. There is no way to know that a 0 return is correct conversion or due to an error. Introduce atoi_paranoid() to detect errors and provide correct conversion. Replace all atoi() calls with atoi_paranoid(). Signed-off-by: Vipin Sharma Suggested-by: David Matlack Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-4-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/aarch64/arch_timer.c | 8 ++++---- .../selftests/kvm/aarch64/debug-exceptions.c | 2 +- .../testing/selftests/kvm/aarch64/vgic_irq.c | 6 +++--- .../selftests/kvm/access_tracking_perf_test.c | 2 +- .../selftests/kvm/demand_paging_test.c | 2 +- .../selftests/kvm/dirty_log_perf_test.c | 8 ++++---- .../testing/selftests/kvm/include/test_util.h | 2 ++ .../selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/test_util.c | 19 +++++++++++++++++++ .../selftests/kvm/max_guest_memory_test.c | 6 +++--- .../kvm/memslot_modification_stress_test.c | 6 +++--- .../testing/selftests/kvm/memslot_perf_test.c | 10 +++++----- .../selftests/kvm/set_memory_region_test.c | 2 +- .../selftests/kvm/x86_64/nx_huge_pages_test.c | 4 ++-- 14 files changed, 50 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 574eb73f0e904..251e7ff048831 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -414,7 +414,7 @@ static bool parse_args(int argc, char *argv[]) while ((opt = getopt(argc, argv, "hn:i:p:m:")) != -1) { switch (opt) { case 'n': - test_args.nr_vcpus = atoi(optarg); + test_args.nr_vcpus = atoi_paranoid(optarg); if (test_args.nr_vcpus <= 0) { pr_info("Positive value needed for -n\n"); goto err; @@ -425,21 +425,21 @@ static bool parse_args(int argc, char *argv[]) } break; case 'i': - test_args.nr_iter = atoi(optarg); + test_args.nr_iter = atoi_paranoid(optarg); if (test_args.nr_iter <= 0) { pr_info("Positive value needed for -i\n"); goto err; } break; case 'p': - test_args.timer_period_ms = atoi(optarg); + test_args.timer_period_ms = atoi_paranoid(optarg); if (test_args.timer_period_ms <= 0) { pr_info("Positive value needed for -p\n"); goto err; } break; case 'm': - test_args.migration_freq_ms = atoi(optarg); + test_args.migration_freq_ms = atoi_paranoid(optarg); if (test_args.migration_freq_ms < 0) { pr_info("0 or positive value needed for -m\n"); goto err; diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 947bd201435ce..19fffdf19c9f9 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -423,7 +423,7 @@ int main(int argc, char *argv[]) while ((opt = getopt(argc, argv, "i:")) != -1) { switch (opt) { case 'i': - ss_iteration = atoi(optarg); + ss_iteration = atoi_paranoid(optarg); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 17417220a083f..ae90b718070aa 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -824,16 +824,16 @@ int main(int argc, char **argv) while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { switch (opt) { case 'n': - nr_irqs = atoi(optarg); + nr_irqs = atoi_paranoid(optarg); if (nr_irqs > 1024 || nr_irqs % 32) help(argv[0]); break; case 'e': - eoi_split = (bool)atoi(optarg); + eoi_split = (bool)atoi_paranoid(optarg); default_args = false; break; case 'l': - level_sensitive = (bool)atoi(optarg); + level_sensitive = (bool)atoi_paranoid(optarg); default_args = false; break; case 'h': diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 76c583a07ea22..c6bcc5301e2c2 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -368,7 +368,7 @@ int main(int argc, char *argv[]) params.vcpu_memory_bytes = parse_size(optarg); break; case 'v': - params.nr_vcpus = atoi(optarg); + params.nr_vcpus = atoi_paranoid(optarg); break; case 'o': overlap_memory_access = true; diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 779ae54f89c40..82597fb04146f 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -427,7 +427,7 @@ int main(int argc, char *argv[]) p.src_type = parse_backing_src_type(optarg); break; case 'v': - nr_vcpus = atoi(optarg); + nr_vcpus = atoi_paranoid(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 5bb6954b23587..ecda802b78ffa 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -416,7 +416,7 @@ int main(int argc, char *argv[]) run_vcpus_while_disabling_dirty_logging = true; break; case 'f': - p.wr_fract = atoi(optarg); + p.wr_fract = atoi_paranoid(optarg); TEST_ASSERT(p.wr_fract >= 1, "Write fraction cannot be less than one"); break; @@ -427,7 +427,7 @@ int main(int argc, char *argv[]) help(argv[0]); break; case 'i': - p.iterations = atoi(optarg); + p.iterations = atoi_paranoid(optarg); break; case 'm': guest_modes_cmdline(optarg); @@ -445,12 +445,12 @@ int main(int argc, char *argv[]) p.backing_src = parse_backing_src_type(optarg); break; case 'v': - nr_vcpus = atoi(optarg); + nr_vcpus = atoi_paranoid(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'x': - p.slots = atoi(optarg); + p.slots = atoi_paranoid(optarg); break; default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index befc754ce9b3b..feae428637599 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -152,4 +152,6 @@ static inline void *align_ptr_up(void *x, size_t size) return (void *)align_up((unsigned long)x, size); } +int atoi_paranoid(const char *num_str); + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index f42c6ac6d71db..ea7feb69bb88a 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -461,7 +461,7 @@ int main(int argc, char *argv[]) p.test_mem_size = parse_size(optarg); break; case 'v': - nr_vcpus = atoi(optarg); + nr_vcpus = atoi_paranoid(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 6d23878bbfe1a..c2d9c68277793 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -334,3 +334,22 @@ long get_run_delay(void) return val[1]; } + +int atoi_paranoid(const char *num_str) +{ + char *end_ptr; + long num; + + errno = 0; + num = strtol(num_str, &end_ptr, 0); + TEST_ASSERT(!errno, "strtol(\"%s\") failed", num_str); + TEST_ASSERT(num_str != end_ptr, + "strtol(\"%s\") didn't find a valid integer.", num_str); + TEST_ASSERT(*end_ptr == '\0', + "strtol(\"%s\") failed to parse trailing characters \"%s\".", + num_str, end_ptr); + TEST_ASSERT(num >= INT_MIN && num <= INT_MAX, + "%ld not in range of [%d, %d]", num, INT_MIN, INT_MAX); + + return num; +} diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 9a6e4f3ad6b57..1595b73dc09a8 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -193,15 +193,15 @@ int main(int argc, char *argv[]) while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) { switch (opt) { case 'c': - nr_vcpus = atoi(optarg); + nr_vcpus = atoi_paranoid(optarg); TEST_ASSERT(nr_vcpus > 0, "number of vcpus must be >0"); break; case 'm': - max_mem = atoi(optarg) * size_1gb; + max_mem = atoi_paranoid(optarg) * size_1gb; TEST_ASSERT(max_mem > 0, "memory size must be >0"); break; case 's': - slot_size = atoi(optarg) * size_1gb; + slot_size = atoi_paranoid(optarg) * size_1gb; TEST_ASSERT(slot_size > 0, "slot size must be >0"); break; case 'H': diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index bb1d17a1171bc..7d19a27d80d2d 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) guest_modes_cmdline(optarg); break; case 'd': - p.memslot_modification_delay = strtoul(optarg, NULL, 0); + p.memslot_modification_delay = atoi_paranoid(optarg); TEST_ASSERT(p.memslot_modification_delay >= 0, "A negative delay is not supported."); break; @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) guest_percpu_mem_size = parse_size(optarg); break; case 'v': - nr_vcpus = atoi(optarg); + nr_vcpus = atoi_paranoid(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); @@ -175,7 +175,7 @@ int main(int argc, char *argv[]) p.partition_vcpu_memory_access = false; break; case 'i': - p.nr_memslot_modifications = atoi(optarg); + p.nr_memslot_modifications = atoi_paranoid(optarg); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 44995446d942a..4bae9e3f5ca1e 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -885,21 +885,21 @@ static bool parse_args(int argc, char *argv[], map_unmap_verify = true; break; case 's': - targs->nslots = atoi(optarg); + targs->nslots = atoi_paranoid(optarg); if (targs->nslots <= 0 && targs->nslots != -1) { pr_info("Slot count cap has to be positive or -1 for no cap\n"); return false; } break; case 'f': - targs->tfirst = atoi(optarg); + targs->tfirst = atoi_paranoid(optarg); if (targs->tfirst < 0) { pr_info("First test to run has to be non-negative\n"); return false; } break; case 'e': - targs->tlast = atoi(optarg); + targs->tlast = atoi_paranoid(optarg); if (targs->tlast < 0 || targs->tlast >= NTESTS) { pr_info("Last test to run has to be non-negative and less than %zu\n", NTESTS); @@ -907,14 +907,14 @@ static bool parse_args(int argc, char *argv[], } break; case 'l': - targs->seconds = atoi(optarg); + targs->seconds = atoi_paranoid(optarg); if (targs->seconds < 0) { pr_info("Test length in seconds has to be non-negative\n"); return false; } break; case 'r': - targs->runs = atoi(optarg); + targs->runs = atoi_paranoid(optarg); if (targs->runs <= 0) { pr_info("Runs per test has to be positive\n"); return false; diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 0d55f508d5953..c366949c8362c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -407,7 +407,7 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ if (argc > 1) - loops = atoi(argv[1]); + loops = atoi_paranoid(argv[1]); else loops = 10; diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 59ffe7fd354f8..354b6902849c5 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -241,10 +241,10 @@ int main(int argc, char **argv) while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { switch (opt) { case 'p': - reclaim_period_ms = atoi(optarg); + reclaim_period_ms = atoi_paranoid(optarg); break; case 't': - token = atoi(optarg); + token = atoi_paranoid(optarg); break; case 'r': reboot_permissions = true; -- GitLab From 69a62e2004b8bc3f9572f88a592b168345a6bbf9 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:16 -0700 Subject: [PATCH 0642/1423] KVM: selftests: Use SZ_* macros from sizes.h in max_guest_memory_test.c Replace size_1gb defined in max_guest_memory_test.c with the SZ_1G, SZ_2G and SZ_4G from linux/sizes.h header file. Signed-off-by: Vipin Sharma Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-5-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/max_guest_memory_test.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 1595b73dc09a8..8056dc5831b52 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "kvm_util.h" #include "test_util.h" @@ -162,8 +163,7 @@ int main(int argc, char *argv[]) * just below the 4gb boundary. This test could create memory at * 1gb-3gb,but it's simpler to skip straight to 4gb. */ - const uint64_t size_1gb = (1 << 30); - const uint64_t start_gpa = (4ull * size_1gb); + const uint64_t start_gpa = SZ_4G; const int first_slot = 1; struct timespec time_start, time_run1, time_reset, time_run2; @@ -180,13 +180,13 @@ int main(int argc, char *argv[]) * are quite common for x86, requires changing only max_mem (KVM allows * 32k memslots, 32k * 2gb == ~64tb of guest memory). */ - slot_size = 2 * size_1gb; + slot_size = SZ_2G; max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); TEST_ASSERT(max_slots > first_slot, "KVM is broken"); /* All KVM MMUs should be able to survive a 128gb guest. */ - max_mem = 128 * size_1gb; + max_mem = 128ull * SZ_1G; calc_default_nr_vcpus(); @@ -197,11 +197,11 @@ int main(int argc, char *argv[]) TEST_ASSERT(nr_vcpus > 0, "number of vcpus must be >0"); break; case 'm': - max_mem = atoi_paranoid(optarg) * size_1gb; + max_mem = 1ull * atoi_paranoid(optarg) * SZ_1G; TEST_ASSERT(max_mem > 0, "memory size must be >0"); break; case 's': - slot_size = atoi_paranoid(optarg) * size_1gb; + slot_size = 1ull * atoi_paranoid(optarg) * SZ_1G; TEST_ASSERT(slot_size > 0, "slot size must be >0"); break; case 'H': @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ /* Identity map memory in the guest using 1gb pages. */ - for (i = 0; i < slot_size; i += size_1gb) + for (i = 0; i < slot_size; i += SZ_1G) __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); #else for (i = 0; i < slot_size; i += vm->page_size) @@ -260,7 +260,7 @@ int main(int argc, char *argv[]) vcpus = NULL; pr_info("Running with %lugb of guest memory and %u vCPUs\n", - (gpa - start_gpa) / size_1gb, nr_vcpus); + (gpa - start_gpa) / SZ_1G, nr_vcpus); rendezvous_with_vcpus(&time_start, "spawning"); rendezvous_with_vcpus(&time_run1, "run 1"); -- GitLab From c15bdebb32ddc73faac5e5180d6997b360e81619 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:17 -0700 Subject: [PATCH 0643/1423] KVM: selftests: Shorten the test args in memslot_modification_stress_test.c Change test args memslot_modification_delay and nr_memslot_modifications to delay and nr_iterations for simplicity. Signed-off-by: Vipin Sharma Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-6-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../kvm/memslot_modification_stress_test.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 7d19a27d80d2d..3a67d3637f481 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -87,8 +87,8 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, } struct test_params { - useconds_t memslot_modification_delay; - uint64_t nr_memslot_modifications; + useconds_t delay; + uint64_t nr_iterations; bool partition_vcpu_memory_access; }; @@ -107,8 +107,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Started all vCPUs\n"); - add_remove_memslot(vm, p->memslot_modification_delay, - p->nr_memslot_modifications); + add_remove_memslot(vm, p->delay, p->nr_iterations); run_vcpus = false; @@ -144,9 +143,8 @@ int main(int argc, char *argv[]) int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); int opt; struct test_params p = { - .memslot_modification_delay = 0, - .nr_memslot_modifications = - DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS, + .delay = 0, + .nr_iterations = DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS, .partition_vcpu_memory_access = true }; @@ -158,8 +156,8 @@ int main(int argc, char *argv[]) guest_modes_cmdline(optarg); break; case 'd': - p.memslot_modification_delay = atoi_paranoid(optarg); - TEST_ASSERT(p.memslot_modification_delay >= 0, + p.delay = atoi_paranoid(optarg); + TEST_ASSERT(p.delay >= 0, "A negative delay is not supported."); break; case 'b': @@ -175,7 +173,7 @@ int main(int argc, char *argv[]) p.partition_vcpu_memory_access = false; break; case 'i': - p.nr_memslot_modifications = atoi_paranoid(optarg); + p.nr_iterations = atoi_paranoid(optarg); break; case 'h': default: -- GitLab From 0001725d0f9b5d749540021befb67c117d566416 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:18 -0700 Subject: [PATCH 0644/1423] KVM: selftests: Add atoi_positive() and atoi_non_negative() for input validation Many KVM selftests take command line arguments which are supposed to be positive (>0) or non-negative (>=0). Some tests do these validation and some missed adding the check. Add atoi_positive() and atoi_non_negative() to validate inputs in selftests before proceeding to use those values. Signed-off-by: Vipin Sharma Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-7-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/aarch64/arch_timer.c | 25 ++++--------------- .../selftests/kvm/aarch64/debug-exceptions.c | 2 +- .../testing/selftests/kvm/aarch64/vgic_irq.c | 2 +- .../selftests/kvm/access_tracking_perf_test.c | 2 +- .../selftests/kvm/demand_paging_test.c | 4 +-- .../selftests/kvm/dirty_log_perf_test.c | 12 ++++----- .../testing/selftests/kvm/include/test_util.h | 16 ++++++++++++ .../selftests/kvm/kvm_page_table_test.c | 4 +-- .../selftests/kvm/max_guest_memory_test.c | 9 +++---- .../kvm/memslot_modification_stress_test.c | 10 +++----- .../testing/selftests/kvm/memslot_perf_test.c | 22 ++++------------ .../selftests/kvm/set_memory_region_test.c | 2 +- .../selftests/kvm/x86_64/nx_huge_pages_test.c | 3 +-- 13 files changed, 47 insertions(+), 66 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 251e7ff048831..9409617fce9c4 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -414,36 +414,21 @@ static bool parse_args(int argc, char *argv[]) while ((opt = getopt(argc, argv, "hn:i:p:m:")) != -1) { switch (opt) { case 'n': - test_args.nr_vcpus = atoi_paranoid(optarg); - if (test_args.nr_vcpus <= 0) { - pr_info("Positive value needed for -n\n"); - goto err; - } else if (test_args.nr_vcpus > KVM_MAX_VCPUS) { + test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg); + if (test_args.nr_vcpus > KVM_MAX_VCPUS) { pr_info("Max allowed vCPUs: %u\n", KVM_MAX_VCPUS); goto err; } break; case 'i': - test_args.nr_iter = atoi_paranoid(optarg); - if (test_args.nr_iter <= 0) { - pr_info("Positive value needed for -i\n"); - goto err; - } + test_args.nr_iter = atoi_positive("Number of iterations", optarg); break; case 'p': - test_args.timer_period_ms = atoi_paranoid(optarg); - if (test_args.timer_period_ms <= 0) { - pr_info("Positive value needed for -p\n"); - goto err; - } + test_args.timer_period_ms = atoi_positive("Periodicity", optarg); break; case 'm': - test_args.migration_freq_ms = atoi_paranoid(optarg); - if (test_args.migration_freq_ms < 0) { - pr_info("0 or positive value needed for -m\n"); - goto err; - } + test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 19fffdf19c9f9..878c334607e1a 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -423,7 +423,7 @@ int main(int argc, char *argv[]) while ((opt = getopt(argc, argv, "i:")) != -1) { switch (opt) { case 'i': - ss_iteration = atoi_paranoid(optarg); + ss_iteration = atoi_positive("Number of iterations", optarg); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index ae90b718070aa..4ead42a072b7d 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -824,7 +824,7 @@ int main(int argc, char **argv) while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { switch (opt) { case 'n': - nr_irqs = atoi_paranoid(optarg); + nr_irqs = atoi_non_negative("Number of IRQs", optarg); if (nr_irqs > 1024 || nr_irqs % 32) help(argv[0]); break; diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index c6bcc5301e2c2..a81e7a7ae18fe 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -368,7 +368,7 @@ int main(int argc, char *argv[]) params.vcpu_memory_bytes = parse_size(optarg); break; case 'v': - params.nr_vcpus = atoi_paranoid(optarg); + params.nr_vcpus = atoi_positive("Number of vCPUs", optarg); break; case 'o': overlap_memory_access = true; diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 82597fb04146f..0c98181fa2489 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -427,8 +427,8 @@ int main(int argc, char *argv[]) p.src_type = parse_backing_src_type(optarg); break; case 'v': - nr_vcpus = atoi_paranoid(optarg); - TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'o': diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index ecda802b78ffa..4d639683b8efa 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -416,9 +416,7 @@ int main(int argc, char *argv[]) run_vcpus_while_disabling_dirty_logging = true; break; case 'f': - p.wr_fract = atoi_paranoid(optarg); - TEST_ASSERT(p.wr_fract >= 1, - "Write fraction cannot be less than one"); + p.wr_fract = atoi_positive("Write fraction", optarg); break; case 'g': dirty_log_manual_caps = 0; @@ -427,7 +425,7 @@ int main(int argc, char *argv[]) help(argv[0]); break; case 'i': - p.iterations = atoi_paranoid(optarg); + p.iterations = atoi_positive("Number of iterations", optarg); break; case 'm': guest_modes_cmdline(optarg); @@ -445,12 +443,12 @@ int main(int argc, char *argv[]) p.backing_src = parse_backing_src_type(optarg); break; case 'v': - nr_vcpus = atoi_paranoid(optarg); - TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'x': - p.slots = atoi_paranoid(optarg); + p.slots = atoi_positive("Number of slots", optarg); break; default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index feae428637599..3be98e81189a7 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -154,4 +154,20 @@ static inline void *align_ptr_up(void *x, size_t size) int atoi_paranoid(const char *num_str); +static inline uint32_t atoi_positive(const char *name, const char *num_str) +{ + int num = atoi_paranoid(num_str); + + TEST_ASSERT(num > 0, "%s must be greater than 0, got '%s'", name, num_str); + return num; +} + +static inline uint32_t atoi_non_negative(const char *name, const char *num_str) +{ + int num = atoi_paranoid(num_str); + + TEST_ASSERT(num >= 0, "%s must be non-negative, got '%s'", name, num_str); + return num; +} + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index ea7feb69bb88a..696b366be06b4 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -461,8 +461,8 @@ int main(int argc, char *argv[]) p.test_mem_size = parse_size(optarg); break; case 'v': - nr_vcpus = atoi_paranoid(optarg); - TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 's': diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 8056dc5831b52..feaf2be20ff25 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -193,16 +193,13 @@ int main(int argc, char *argv[]) while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) { switch (opt) { case 'c': - nr_vcpus = atoi_paranoid(optarg); - TEST_ASSERT(nr_vcpus > 0, "number of vcpus must be >0"); + nr_vcpus = atoi_positive("Number of vCPUs", optarg); break; case 'm': - max_mem = 1ull * atoi_paranoid(optarg) * SZ_1G; - TEST_ASSERT(max_mem > 0, "memory size must be >0"); + max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G; break; case 's': - slot_size = 1ull * atoi_paranoid(optarg) * SZ_1G; - TEST_ASSERT(slot_size > 0, "slot size must be >0"); + slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G; break; case 'H': hugepages = true; diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 3a67d3637f481..4bdfc910ba4d3 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -156,16 +156,14 @@ int main(int argc, char *argv[]) guest_modes_cmdline(optarg); break; case 'd': - p.delay = atoi_paranoid(optarg); - TEST_ASSERT(p.delay >= 0, - "A negative delay is not supported."); + p.delay = atoi_non_negative("Delay", optarg); break; case 'b': guest_percpu_mem_size = parse_size(optarg); break; case 'v': - nr_vcpus = atoi_paranoid(optarg); - TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; @@ -173,7 +171,7 @@ int main(int argc, char *argv[]) p.partition_vcpu_memory_access = false; break; case 'i': - p.nr_iterations = atoi_paranoid(optarg); + p.nr_iterations = atoi_positive("Number of iterations", optarg); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 4bae9e3f5ca1e..330aaef1c02fb 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -892,33 +892,21 @@ static bool parse_args(int argc, char *argv[], } break; case 'f': - targs->tfirst = atoi_paranoid(optarg); - if (targs->tfirst < 0) { - pr_info("First test to run has to be non-negative\n"); - return false; - } + targs->tfirst = atoi_non_negative("First test", optarg); break; case 'e': - targs->tlast = atoi_paranoid(optarg); - if (targs->tlast < 0 || targs->tlast >= NTESTS) { + targs->tlast = atoi_non_negative("Last test", optarg); + if (targs->tlast >= NTESTS) { pr_info("Last test to run has to be non-negative and less than %zu\n", NTESTS); return false; } break; case 'l': - targs->seconds = atoi_paranoid(optarg); - if (targs->seconds < 0) { - pr_info("Test length in seconds has to be non-negative\n"); - return false; - } + targs->seconds = atoi_non_negative("Test length", optarg); break; case 'r': - targs->runs = atoi_paranoid(optarg); - if (targs->runs <= 0) { - pr_info("Runs per test has to be positive\n"); - return false; - } + targs->runs = atoi_positive("Runs per test", optarg); break; } } diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index c366949c8362c..85c16f09a50e1 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -407,7 +407,7 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ if (argc > 1) - loops = atoi_paranoid(argv[1]); + loops = atoi_positive("Number of iterations", argv[1]); else loops = 10; diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 354b6902849c5..ea0978f22db86 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -241,7 +241,7 @@ int main(int argc, char **argv) while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { switch (opt) { case 'p': - reclaim_period_ms = atoi_paranoid(optarg); + reclaim_period_ms = atoi_non_negative("Reclaim period", optarg); break; case 't': token = atoi_paranoid(optarg); @@ -257,7 +257,6 @@ int main(int argc, char **argv) } TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); - TEST_REQUIRE(reclaim_period_ms > 0); __TEST_REQUIRE(token == MAGIC_TOKEN, "This test must be run with the magic token %d.\n" -- GitLab From d886724ea81c6a4dc5e37d4ee09287a31ab8335e Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Thu, 3 Nov 2022 12:17:19 -0700 Subject: [PATCH 0645/1423] KVM: selftests: Allowing running dirty_log_perf_test on specific CPUs Add a command line option, -c, to pin vCPUs to physical CPUs (pCPUs), i.e. to force vCPUs to run on specific pCPUs. Requirement to implement this feature came in discussion on the patch "Make page tables for eager page splitting NUMA aware" https://lore.kernel.org/lkml/YuhPT2drgqL+osLl@google.com/ This feature is useful as it provides a way to analyze performance based on the vCPUs and dirty log worker locations, like on the different NUMA nodes or on the same NUMA nodes. To keep things simple, implementation is intentionally very limited, either all of the vCPUs will be pinned followed by an optional main thread or nothing will be pinned. Signed-off-by: Vipin Sharma Suggested-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221103191719.1559407-8-vipinsh@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/dirty_log_perf_test.c | 25 ++++++++- .../selftests/kvm/include/kvm_util_base.h | 4 ++ .../selftests/kvm/include/perf_test_util.h | 4 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 54 +++++++++++++++++++ .../selftests/kvm/lib/perf_test_util.c | 8 ++- 5 files changed, 92 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 4d639683b8efa..0612158329aa0 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -353,7 +353,7 @@ static void help(char *name) puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] [-g] " "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" - "[-x memslots]\n", name); + "[-x memslots] [-c physical cpus to run test on]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -383,6 +383,17 @@ static void help(char *name) backing_src_help("-s"); printf(" -x: Split the memory region into this number of memslots.\n" " (default: 1)\n"); + printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" + " values (target pCPU), one for each vCPU, plus an optional\n" + " entry for the main application task (specified via entry\n" + " ). If used, entries must be provided for all\n" + " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" + " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" + " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" + " ./dirty_log_perf_test -v 3 -c 22,23,24,50\n\n" + " To leave the application task unpinned, drop the final entry:\n\n" + " ./dirty_log_perf_test -v 3 -c 22,23,24\n\n" + " (default: no pinning)\n"); puts(""); exit(0); } @@ -390,6 +401,7 @@ static void help(char *name) int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + const char *pcpu_list = NULL; struct test_params p = { .iterations = TEST_HOST_LOOP_N, .wr_fract = 1, @@ -406,11 +418,14 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "b:ef:ghi:m:nop:s:v:x:")) != -1) { + while ((opt = getopt(argc, argv, "b:c:ef:ghi:m:nop:s:v:x:")) != -1) { switch (opt) { case 'b': guest_percpu_mem_size = parse_size(optarg); break; + case 'c': + pcpu_list = optarg; + break; case 'e': /* 'e' is for evil. */ run_vcpus_while_disabling_dirty_logging = true; @@ -456,6 +471,12 @@ int main(int argc, char *argv[]) } } + if (pcpu_list) { + kvm_parse_vcpu_pinning(pcpu_list, perf_test_args.vcpu_to_pcpu, + nr_vcpus); + perf_test_args.pin_vcpus = true; + } + TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations"); pr_info("Test iterations: %"PRIu64"\n", p.iterations); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index e42a09cd24a04..3bf2333ef95d2 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -688,6 +688,10 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); +void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus); + unsigned long vm_compute_max_gfn(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index eaa88df0555a8..849c875dd0ffa 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -39,6 +39,10 @@ struct perf_test_args { /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ bool nested; + /* True if all vCPUs are pinned to pCPUs */ + bool pin_vcpus; + /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */ + uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS]; struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS]; }; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f1cb1627161fd..3b7710fb37840 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -11,6 +11,7 @@ #include "processor.h" #include +#include #include #include #include @@ -443,6 +444,59 @@ struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) return vm_vcpu_recreate(vm, 0); } +void kvm_pin_this_task_to_pcpu(uint32_t pcpu) +{ + cpu_set_t mask; + int r; + + CPU_ZERO(&mask); + CPU_SET(pcpu, &mask); + r = sched_setaffinity(0, sizeof(mask), &mask); + TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.\n", pcpu); +} + +static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) +{ + uint32_t pcpu = atoi_non_negative("CPU number", cpu_str); + + TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask), + "Not allowed to run on pCPU '%d', check cgroups?\n", pcpu); + return pcpu; +} + +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus) +{ + cpu_set_t allowed_mask; + char *cpu, *cpu_list; + char delim[2] = ","; + int i, r; + + cpu_list = strdup(pcpus_string); + TEST_ASSERT(cpu_list, "strdup() allocation failed.\n"); + + r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask); + TEST_ASSERT(!r, "sched_getaffinity() failed"); + + cpu = strtok(cpu_list, delim); + + /* 1. Get all pcpus for vcpus. */ + for (i = 0; i < nr_vcpus; i++) { + TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'\n", i); + vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask); + cpu = strtok(NULL, delim); + } + + /* 2. Check if the main worker needs to be pinned. */ + if (cpu) { + kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask)); + cpu = strtok(NULL, delim); + } + + TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu); + free(cpu_list); +} + /* * Userspace Memory Region Find * diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 9618b37c66f7e..3a1d0a44419b0 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2020, Google LLC. */ +#define _GNU_SOURCE + #include #include "kvm_util.h" @@ -243,6 +245,10 @@ void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_v static void *vcpu_thread_main(void *data) { struct vcpu_thread *vcpu = data; + int vcpu_idx = vcpu->vcpu_idx; + + if (perf_test_args.pin_vcpus) + kvm_pin_this_task_to_pcpu(perf_test_args.vcpu_to_pcpu[vcpu_idx]); WRITE_ONCE(vcpu->running, true); @@ -255,7 +261,7 @@ static void *vcpu_thread_main(void *data) while (!READ_ONCE(all_vcpu_threads_running)) ; - vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_idx]); + vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu_idx]); return NULL; } -- GitLab From b31f21a7e97eee501db86714868d84377e68e4df Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Mon, 7 Nov 2022 18:22:05 +0000 Subject: [PATCH 0646/1423] KVM: selftests: implement random number generator for guest code Implement random number generator for guest code to randomize parts of the test, making it less predictable and a more accurate reflection of reality. The random number generator chosen is the Park-Miller Linear Congruential Generator, a fancy name for a basic and well-understood random number generator entirely sufficient for this purpose. Signed-off-by: Colton Lewis Reviewed-by: Sean Christopherson Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20221107182208.479157-2-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/test_util.h | 7 +++++++ tools/testing/selftests/kvm/lib/test_util.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 3be98e81189a7..80d6416f3012e 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -77,6 +77,13 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); struct timespec timespec_elapsed(struct timespec start); struct timespec timespec_div(struct timespec ts, int divisor); +struct guest_random_state { + uint32_t seed; +}; + +struct guest_random_state new_guest_random_state(uint32_t seed); +uint32_t guest_random_u32(struct guest_random_state *state); + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index c2d9c68277793..5c22fa4c2825e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -17,6 +17,23 @@ #include "test_util.h" +/* + * Random number generator that is usable from guest code. This is the + * Park-Miller LCG using standard constants. + */ + +struct guest_random_state new_guest_random_state(uint32_t seed) +{ + struct guest_random_state s = {.seed = seed}; + return s; +} + +uint32_t guest_random_u32(struct guest_random_state *state) +{ + state->seed = (uint64_t)state->seed * 48271 % ((uint32_t)(1 << 31) - 1); + return state->seed; +} + /* * Parses "[0-9]+[kmgt]?". */ -- GitLab From f11aa24bdbc66a10378d28ee962b95426e8d2a09 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Mon, 7 Nov 2022 18:22:06 +0000 Subject: [PATCH 0647/1423] KVM: selftests: create -r argument to specify random seed Create a -r argument to specify a random seed. If no argument is provided, the seed defaults to 1. The random seed is set with perf_test_set_random_seed() and must be set before guest_code runs to apply. Signed-off-by: Colton Lewis Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20221107182208.479157-3-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 12 ++++++++++-- tools/testing/selftests/kvm/include/perf_test_util.h | 2 ++ tools/testing/selftests/kvm/lib/perf_test_util.c | 6 ++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 0612158329aa0..eb63ca12b519d 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -132,6 +132,7 @@ struct test_params { bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; int slots; + uint32_t random_seed; }; static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) @@ -225,6 +226,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) p->slots, p->backing_src, p->partition_vcpu_memory_access); + pr_info("Random seed: %u\n", p->random_seed); + perf_test_set_random_seed(vm, p->random_seed); perf_test_set_wr_fract(vm, p->wr_fract); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; @@ -352,7 +355,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] [-g] " - "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed ] [-s mem type]" "[-x memslots] [-c physical cpus to run test on]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", @@ -380,6 +383,7 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); + printf(" -r: specify the starting random seed.\n"); backing_src_help("-s"); printf(" -x: Split the memory region into this number of memslots.\n" " (default: 1)\n"); @@ -408,6 +412,7 @@ int main(int argc, char *argv[]) .partition_vcpu_memory_access = true, .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, + .random_seed = 1, }; int opt; @@ -418,7 +423,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "b:c:ef:ghi:m:nop:s:v:x:")) != -1) { + while ((opt = getopt(argc, argv, "b:c:ef:ghi:m:nop:r:s:v:x:")) != -1) { switch (opt) { case 'b': guest_percpu_mem_size = parse_size(optarg); @@ -454,6 +459,9 @@ int main(int argc, char *argv[]) case 'p': p.phys_offset = strtoull(optarg, NULL, 0); break; + case 'r': + p.random_seed = atoi_positive("Random seed", optarg); + break; case 's': p.backing_src = parse_backing_src_type(optarg); break; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 849c875dd0ffa..5a0e48b625a20 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -35,6 +35,7 @@ struct perf_test_args { uint64_t gpa; uint64_t size; uint64_t guest_page_size; + uint32_t random_seed; int wr_fract; /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ @@ -56,6 +57,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract); +void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); void perf_test_join_vcpu_threads(int vcpus); diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 3a1d0a44419b0..d48ee4f604f0a 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -231,6 +231,12 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) sync_global_to_guest(vm, perf_test_args); } +void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) +{ + perf_test_args.random_seed = random_seed; + sync_global_to_guest(vm, perf_test_args.random_seed); +} + uint64_t __weak perf_test_nested_pages(int nr_vcpus) { return 0; -- GitLab From 6864c6442f4dfa02c7cf48199cf3ea6bb1fe74ed Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Mon, 7 Nov 2022 18:22:07 +0000 Subject: [PATCH 0648/1423] KVM: selftests: randomize which pages are written vs read Randomize which pages are written vs read using the random number generator. Change the variable wr_fract and associated function calls to write_percent that now operates as a percentage from 0 to 100 where X means each page has an X% chance of being written. Change the -f argument to -w to reflect the new variable semantics. Keep the same default of 100% writes. Population always uses 100% writes to ensure all memory is actually populated and not just mapped to the zero page. The prevents expensive copy-on-write faults from occurring during the dirty memory iterations below, which would pollute the performance results. Each vCPU calculates its own random seed by adding its index to the seed provided. Signed-off-by: Colton Lewis Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20221107182208.479157-4-coltonlewis@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/access_tracking_perf_test.c | 2 +- .../selftests/kvm/dirty_log_perf_test.c | 37 +++++++++++++------ .../selftests/kvm/include/perf_test_util.h | 4 +- .../selftests/kvm/lib/perf_test_util.c | 13 ++++--- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index a81e7a7ae18fe..c0cdf07de1471 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -279,7 +279,7 @@ static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *descripti static void access_memory(struct kvm_vm *vm, int nr_vcpus, enum access_type access, const char *description) { - perf_test_set_wr_fract(vm, (access == ACCESS_READ) ? INT_MAX : 1); + perf_test_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100); iteration_work = ITERATION_ACCESS_MEMORY; run_iteration(vm, nr_vcpus, description); } diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index eb63ca12b519d..e9ce50fe7295c 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -128,10 +128,10 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) struct test_params { unsigned long iterations; uint64_t phys_offset; - int wr_fract; bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; int slots; + uint32_t write_percent; uint32_t random_seed; }; @@ -228,7 +228,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Random seed: %u\n", p->random_seed); perf_test_set_random_seed(vm, p->random_seed); - perf_test_set_wr_fract(vm, p->wr_fract); + perf_test_set_write_percent(vm, p->write_percent); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); @@ -251,6 +251,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) for (i = 0; i < nr_vcpus; i++) vcpu_last_completed_iteration[i] = -1; + /* + * Use 100% writes during the population phase to ensure all + * memory is actually populated and not just mapped to the zero + * page. The prevents expensive copy-on-write faults from + * occurring during the dirty memory iterations below, which + * would pollute the performance results. + */ + perf_test_set_write_percent(vm, 100); perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); /* Allow the vCPUs to populate memory */ @@ -272,6 +280,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); + perf_test_set_write_percent(vm, p->write_percent); + while (iteration < p->iterations) { /* * Incrementing the iteration number will start the vCPUs @@ -356,7 +366,7 @@ static void help(char *name) puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] [-g] " "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed ] [-s mem type]" - "[-x memslots] [-c physical cpus to run test on]\n", name); + "[-x memslots] [-w percentage] [-c physical cpus to run test on]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -376,10 +386,6 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " dirtied by each vCPU. e.g. 10M or 3G.\n" " (default: 1G)\n"); - printf(" -f: specify the fraction of pages which should be written to\n" - " as opposed to simply read, in the form\n" - " 1/.\n" - " (default: 1 i.e. all pages are written to.)\n"); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -387,6 +393,11 @@ static void help(char *name) backing_src_help("-s"); printf(" -x: Split the memory region into this number of memslots.\n" " (default: 1)\n"); + printf(" -w: specify the percentage of pages which should be written to\n" + " as an integer from 0-100 inclusive. This is probabalistic,\n" + " so -w X means each page has an X%% chance of writing\n" + " and a (100-X)%% chance of reading.\n" + " (default: 100 i.e. all pages are written to.)\n"); printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" " values (target pCPU), one for each vCPU, plus an optional\n" " entry for the main application task (specified via entry\n" @@ -408,11 +419,11 @@ int main(int argc, char *argv[]) const char *pcpu_list = NULL; struct test_params p = { .iterations = TEST_HOST_LOOP_N, - .wr_fract = 1, .partition_vcpu_memory_access = true, .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, .random_seed = 1, + .write_percent = 100, }; int opt; @@ -423,7 +434,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "b:c:ef:ghi:m:nop:r:s:v:x:")) != -1) { + while ((opt = getopt(argc, argv, "b:c:eghi:m:nop:r:s:v:x:w:")) != -1) { switch (opt) { case 'b': guest_percpu_mem_size = parse_size(optarg); @@ -435,9 +446,6 @@ int main(int argc, char *argv[]) /* 'e' is for evil. */ run_vcpus_while_disabling_dirty_logging = true; break; - case 'f': - p.wr_fract = atoi_positive("Write fraction", optarg); - break; case 'g': dirty_log_manual_caps = 0; break; @@ -470,6 +478,11 @@ int main(int argc, char *argv[]) TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; + case 'w': + p.write_percent = atoi_non_negative("Write percentage", optarg); + TEST_ASSERT(p.write_percent <= 100, + "Write percentage must be between 0 and 100"); + break; case 'x': p.slots = atoi_positive("Number of slots", optarg); break; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 5a0e48b625a20..6470ca0fec4c0 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -36,7 +36,7 @@ struct perf_test_args { uint64_t size; uint64_t guest_page_size; uint32_t random_seed; - int wr_fract; + uint32_t write_percent; /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ bool nested; @@ -56,7 +56,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, bool partition_vcpu_memory_access); void perf_test_destroy_vm(struct kvm_vm *vm); -void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract); +void perf_test_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index d48ee4f604f0a..15000f71cdeea 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -48,10 +48,13 @@ void perf_test_guest_code(uint32_t vcpu_idx) { struct perf_test_args *pta = &perf_test_args; struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_idx]; + struct guest_random_state rand_state; uint64_t gva; uint64_t pages; int i; + rand_state = new_guest_random_state(pta->random_seed + vcpu_idx); + gva = vcpu_args->gva; pages = vcpu_args->pages; @@ -62,7 +65,7 @@ void perf_test_guest_code(uint32_t vcpu_idx) for (i = 0; i < pages; i++) { uint64_t addr = gva + (i * pta->guest_page_size); - if (i % pta->wr_fract == 0) + if (guest_random_u32(&rand_state) % 100 < pta->write_percent) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -123,7 +126,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); /* By default vCPUs will write to memory. */ - pta->wr_fract = 1; + pta->write_percent = 100; /* * Snapshot the non-huge page size. This is used by the guest code to @@ -225,10 +228,10 @@ void perf_test_destroy_vm(struct kvm_vm *vm) kvm_vm_free(vm); } -void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) +void perf_test_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) { - perf_test_args.wr_fract = wr_fract; - sync_global_to_guest(vm, perf_test_args); + perf_test_args.write_percent = write_percent; + sync_global_to_guest(vm, perf_test_args.write_percent); } void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) -- GitLab From c967a4752ac66cc0ef8c0b1f4914151ca8758709 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Mon, 7 Nov 2022 18:22:08 +0000 Subject: [PATCH 0649/1423] KVM: selftests: randomize page access order Create the ability to randomize page access order with the -a argument. This includes the possibility that the same pages may be hit multiple times during an iteration or not at all. Population has random access as false to ensure all pages will be touched by population and avoid page faults in late dirty memory that would pollute the test results. Signed-off-by: Colton Lewis Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20221107182208.479157-5-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 11 +++++++++-- .../selftests/kvm/include/perf_test_util.h | 3 +++ tools/testing/selftests/kvm/lib/perf_test_util.c | 15 ++++++++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index e9ce50fe7295c..47cbda3580fd6 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -133,6 +133,7 @@ struct test_params { int slots; uint32_t write_percent; uint32_t random_seed; + bool random_access; }; static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) @@ -259,6 +260,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) * would pollute the performance results. */ perf_test_set_write_percent(vm, 100); + perf_test_set_random_access(vm, false); perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); /* Allow the vCPUs to populate memory */ @@ -281,6 +283,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) ts_diff.tv_sec, ts_diff.tv_nsec); perf_test_set_write_percent(vm, p->write_percent); + perf_test_set_random_access(vm, p->random_access); while (iteration < p->iterations) { /* @@ -364,10 +367,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-i iterations] [-p offset] [-g] " + printf("usage: %s [-h] [-a] [-i iterations] [-p offset] [-g] " "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed ] [-s mem type]" "[-x memslots] [-w percentage] [-c physical cpus to run test on]\n", name); puts(""); + printf(" -a: access memory randomly rather than in order.\n"); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n" @@ -434,8 +438,11 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "b:c:eghi:m:nop:r:s:v:x:w:")) != -1) { + while ((opt = getopt(argc, argv, "ab:c:eghi:m:nop:r:s:v:x:w:")) != -1) { switch (opt) { + case 'a': + p.random_access = true; + break; case 'b': guest_percpu_mem_size = parse_size(optarg); break; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 6470ca0fec4c0..75ca679059dc2 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -40,6 +40,8 @@ struct perf_test_args { /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ bool nested; + /* Randomize which pages are accessed by the guest. */ + bool random_access; /* True if all vCPUs are pinned to pCPUs */ bool pin_vcpus; /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */ @@ -58,6 +60,7 @@ void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); +void perf_test_set_random_access(struct kvm_vm *vm, bool random_access); void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); void perf_test_join_vcpu_threads(int vcpus); diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 15000f71cdeea..3a9a3ea01a973 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -51,6 +51,8 @@ void perf_test_guest_code(uint32_t vcpu_idx) struct guest_random_state rand_state; uint64_t gva; uint64_t pages; + uint64_t addr; + uint64_t page; int i; rand_state = new_guest_random_state(pta->random_seed + vcpu_idx); @@ -63,7 +65,12 @@ void perf_test_guest_code(uint32_t vcpu_idx) while (true) { for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * pta->guest_page_size); + if (pta->random_access) + page = guest_random_u32(&rand_state) % pages; + else + page = i; + + addr = gva + (page * pta->guest_page_size); if (guest_random_u32(&rand_state) % 100 < pta->write_percent) *(uint64_t *)addr = 0x0123456789ABCDEF; @@ -240,6 +247,12 @@ void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) sync_global_to_guest(vm, perf_test_args.random_seed); } +void perf_test_set_random_access(struct kvm_vm *vm, bool random_access) +{ + perf_test_args.random_access = random_access; + sync_global_to_guest(vm, perf_test_args.random_access); +} + uint64_t __weak perf_test_nested_pages(int nr_vcpus) { return 0; -- GitLab From 9fda6753c9dd4594e2c66c56b48b56a326ee686f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 12 Oct 2022 09:57:27 -0700 Subject: [PATCH 0650/1423] KVM: selftests: Rename perf_test_util.[ch] to memstress.[ch] Rename the perf_test_util.[ch] files to memstress.[ch]. Symbols are renamed in the following commit to reduce the amount of churn here in hopes of playiing nice with git's file rename detection. The name "memstress" was chosen to better describe the functionality proveded by this library, which is to create and run a VM that reads/writes to guest memory on all vCPUs in parallel. "memstress" also contains the same number of chracters as "perf_test", making it a drop-in replacement in symbols, e.g. function names, without impacting line lengths. Also the lack of underscore between "mem" and "stress" makes it clear "memstress" is a noun. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221012165729.3505266-2-dmatlack@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 4 ++-- tools/testing/selftests/kvm/access_tracking_perf_test.c | 2 +- tools/testing/selftests/kvm/demand_paging_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- .../kvm/include/{perf_test_util.h => memstress.h} | 8 ++++---- .../selftests/kvm/lib/{perf_test_util.c => memstress.c} | 2 +- .../kvm/lib/x86_64/{perf_test_util.c => memstress.c} | 4 ++-- .../selftests/kvm/memslot_modification_stress_test.c | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) rename tools/testing/selftests/kvm/include/{perf_test_util.h => memstress.h} (91%) rename tools/testing/selftests/kvm/lib/{perf_test_util.c => memstress.c} (99%) rename tools/testing/selftests/kvm/lib/x86_64/{perf_test_util.c => memstress.c} (97%) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0172eb6cb6eee..a00253b79040b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -43,7 +43,7 @@ LIBKVM += lib/elf.c LIBKVM += lib/guest_modes.c LIBKVM += lib/io.c LIBKVM += lib/kvm_util.c -LIBKVM += lib/perf_test_util.c +LIBKVM += lib/memstress.c LIBKVM += lib/rbtree.c LIBKVM += lib/sparsebit.c LIBKVM += lib/test_util.c @@ -52,7 +52,7 @@ LIBKVM_STRING += lib/string_override.c LIBKVM_x86_64 += lib/x86_64/apic.c LIBKVM_x86_64 += lib/x86_64/handlers.S -LIBKVM_x86_64 += lib/x86_64/perf_test_util.c +LIBKVM_x86_64 += lib/x86_64/memstress.c LIBKVM_x86_64 += lib/x86_64/processor.c LIBKVM_x86_64 += lib/x86_64/svm.c LIBKVM_x86_64 += lib/x86_64/ucall.c diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index c0cdf07de1471..534d18cc4a6af 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -44,7 +44,7 @@ #include "kvm_util.h" #include "test_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "guest_modes.h" /* Global variable used to synchronize all of the vCPU threads. */ diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 0c98181fa2489..37501e83d1d87 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -20,7 +20,7 @@ #include "kvm_util.h" #include "test_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "guest_modes.h" #ifdef __NR_userfaultfd diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 47cbda3580fd6..d2bac493da5dd 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -16,7 +16,7 @@ #include "kvm_util.h" #include "test_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "guest_modes.h" #ifdef __aarch64__ diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/memstress.h similarity index 91% rename from tools/testing/selftests/kvm/include/perf_test_util.h rename to tools/testing/selftests/kvm/include/memstress.h index 75ca679059dc2..64a523e061259 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 /* - * tools/testing/selftests/kvm/include/perf_test_util.h + * tools/testing/selftests/kvm/include/memstress.h * * Copyright (C) 2020, Google LLC. */ -#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H -#define SELFTEST_KVM_PERF_TEST_UTIL_H +#ifndef SELFTEST_KVM_MEMSTRESS_H +#define SELFTEST_KVM_MEMSTRESS_H #include @@ -69,4 +69,4 @@ void perf_test_guest_code(uint32_t vcpu_id); uint64_t perf_test_nested_pages(int nr_vcpus); void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); -#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ +#endif /* SELFTEST_KVM_MEMSTRESS_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/memstress.c similarity index 99% rename from tools/testing/selftests/kvm/lib/perf_test_util.c rename to tools/testing/selftests/kvm/lib/memstress.c index 3a9a3ea01a973..72f88e5851dd6 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -7,7 +7,7 @@ #include #include "kvm_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "processor.h" struct perf_test_args perf_test_args; diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c similarity index 97% rename from tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c rename to tools/testing/selftests/kvm/lib/x86_64/memstress.c index 0f344a7c89c4b..0bb717ac2cc57 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/x86_64/memstress.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * x86_64-specific extensions to perf_test_util.c. + * x86_64-specific extensions to memstress.c. * * Copyright (C) 2022, Google, Inc. */ @@ -11,7 +11,7 @@ #include "test_util.h" #include "kvm_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "processor.h" #include "vmx.h" diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 4bdfc910ba4d3..0490bd4606e50 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -21,7 +21,7 @@ #include #include -#include "perf_test_util.h" +#include "memstress.h" #include "processor.h" #include "test_util.h" #include "guest_modes.h" @@ -72,7 +72,7 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, int i; /* - * Add the dummy memslot just below the perf_test_util memslot, which is + * Add the dummy memslot just below the memstress memslot, which is * at the top of the guest physical address space. */ gpa = perf_test_args.gpa - pages * vm->page_size; -- GitLab From a008a3351feaffdcf97a2bcf90b789626585258b Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 12 Oct 2022 09:57:28 -0700 Subject: [PATCH 0651/1423] KVM: selftests: Rename pta (short for perf_test_args) to args Rename the local variables "pta" (which is short for perf_test_args) for args. "pta" is not an obvious acronym and using "args" mirrors "vcpu_args". Suggested-by: Sean Christopherson Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221012165729.3505266-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/memstress.c | 60 ++++++++++----------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 72f88e5851dd6..255f77d863304 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -46,8 +46,8 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; */ void perf_test_guest_code(uint32_t vcpu_idx) { - struct perf_test_args *pta = &perf_test_args; - struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_idx]; + struct perf_test_args *args = &perf_test_args; + struct perf_test_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx]; struct guest_random_state rand_state; uint64_t gva; uint64_t pages; @@ -55,7 +55,7 @@ void perf_test_guest_code(uint32_t vcpu_idx) uint64_t page; int i; - rand_state = new_guest_random_state(pta->random_seed + vcpu_idx); + rand_state = new_guest_random_state(args->random_seed + vcpu_idx); gva = vcpu_args->gva; pages = vcpu_args->pages; @@ -65,14 +65,14 @@ void perf_test_guest_code(uint32_t vcpu_idx) while (true) { for (i = 0; i < pages; i++) { - if (pta->random_access) + if (args->random_access) page = guest_random_u32(&rand_state) % pages; else page = i; - addr = gva + (page * pta->guest_page_size); + addr = gva + (page * args->guest_page_size); - if (guest_random_u32(&rand_state) % 100 < pta->write_percent) + if (guest_random_u32(&rand_state) % 100 < args->write_percent) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -87,12 +87,12 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, uint64_t vcpu_memory_bytes, bool partition_vcpu_memory_access) { - struct perf_test_args *pta = &perf_test_args; + struct perf_test_args *args = &perf_test_args; struct perf_test_vcpu_args *vcpu_args; int i; for (i = 0; i < nr_vcpus; i++) { - vcpu_args = &pta->vcpu_args[i]; + vcpu_args = &args->vcpu_args[i]; vcpu_args->vcpu = vcpus[i]; vcpu_args->vcpu_idx = i; @@ -101,20 +101,20 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, vcpu_args->gva = guest_test_virt_mem + (i * vcpu_memory_bytes); vcpu_args->pages = vcpu_memory_bytes / - pta->guest_page_size; - vcpu_args->gpa = pta->gpa + (i * vcpu_memory_bytes); + args->guest_page_size; + vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes); } else { vcpu_args->gva = guest_test_virt_mem; vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) / - pta->guest_page_size; - vcpu_args->gpa = pta->gpa; + args->guest_page_size; + vcpu_args->gpa = args->gpa; } vcpu_args_set(vcpus[i], 1, i); pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", i, vcpu_args->gpa, vcpu_args->gpa + - (vcpu_args->pages * pta->guest_page_size)); + (vcpu_args->pages * args->guest_page_size)); } } @@ -123,7 +123,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, enum vm_mem_backing_src_type backing_src, bool partition_vcpu_memory_access) { - struct perf_test_args *pta = &perf_test_args; + struct perf_test_args *args = &perf_test_args; struct kvm_vm *vm; uint64_t guest_num_pages, slot0_pages = 0; uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); @@ -133,20 +133,20 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); /* By default vCPUs will write to memory. */ - pta->write_percent = 100; + args->write_percent = 100; /* * Snapshot the non-huge page size. This is used by the guest code to * access/dirty pages at the logging granularity. */ - pta->guest_page_size = vm_guest_mode_params[mode].page_size; + args->guest_page_size = vm_guest_mode_params[mode].page_size; guest_num_pages = vm_adjust_num_guest_pages(mode, - (nr_vcpus * vcpu_memory_bytes) / pta->guest_page_size); + (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size); TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, "Guest memory size is not host page size aligned."); - TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0, + TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0, "Guest memory size is not guest page size aligned."); TEST_ASSERT(guest_num_pages % slots == 0, "Guest memory cannot be evenly divided into %d slots.", @@ -156,7 +156,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * If using nested, allocate extra pages for the nested page tables and * in-memory data structures. */ - if (pta->nested) + if (args->nested) slot0_pages += perf_test_nested_pages(nr_vcpus); /* @@ -167,7 +167,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, perf_test_guest_code, vcpus); - pta->vm = vm; + args->vm = vm; /* Put the test region at the top guest physical memory. */ region_end_gfn = vm->max_gfn + 1; @@ -177,8 +177,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * When running vCPUs in L2, restrict the test region to 48 bits to * avoid needing 5-level page tables to identity map L2. */ - if (pta->nested) - region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size); + if (args->nested) + region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size); #endif /* * If there should be more memory in the guest test region than there @@ -190,20 +190,20 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, " nr_vcpus: %d wss: %" PRIx64 "]\n", guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); - pta->gpa = (region_end_gfn - guest_num_pages - 1) * pta->guest_page_size; - pta->gpa = align_down(pta->gpa, backing_src_pagesz); + args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size; + args->gpa = align_down(args->gpa, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ - pta->gpa = align_down(pta->gpa, 1 << 20); + args->gpa = align_down(args->gpa, 1 << 20); #endif - pta->size = guest_num_pages * pta->guest_page_size; + args->size = guest_num_pages * args->guest_page_size; pr_info("guest physical test memory: [0x%lx, 0x%lx)\n", - pta->gpa, pta->gpa + pta->size); + args->gpa, args->gpa + args->size); /* Add extra memory slots for testing */ for (i = 0; i < slots; i++) { uint64_t region_pages = guest_num_pages / slots; - vm_paddr_t region_start = pta->gpa + region_pages * pta->guest_page_size * i; + vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i; vm_userspace_mem_region_add(vm, backing_src, region_start, PERF_TEST_MEM_SLOT_INDEX + i, @@ -211,12 +211,12 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, } /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages); + virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages); perf_test_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access); - if (pta->nested) { + if (args->nested) { pr_info("Configuring vCPUs to run in L2 (nested).\n"); perf_test_setup_nested(vm, nr_vcpus, vcpus); } -- GitLab From 7812d80c0f89c2b610558e09647736b6632beb08 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 12 Oct 2022 09:57:29 -0700 Subject: [PATCH 0652/1423] KVM: selftests: Rename perf_test_util symbols to memstress Replace the perf_test_ prefix on symbol names with memstress_ to match the new file name. "memstress" better describes the functionality proveded by this library, which is to provide functionality for creating and running a VM that stresses VM memory by reading and writing to guest memory on all vCPUs in parallel. "memstress" also contains the same number of chracters as "perf_test", making it a drop-in replacement in symbols, e.g. function names, without impacting line lengths. Also the lack of underscore between "mem" and "stress" makes it clear "memstress" is a noun. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221012165729.3505266-4-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/access_tracking_perf_test.c | 18 ++--- .../selftests/kvm/demand_paging_test.c | 18 ++--- .../selftests/kvm/dirty_log_perf_test.c | 34 +++++----- .../testing/selftests/kvm/include/memstress.h | 30 ++++---- tools/testing/selftests/kvm/lib/memstress.c | 68 +++++++++---------- .../selftests/kvm/lib/x86_64/memstress.c | 32 ++++----- .../kvm/memslot_modification_stress_test.c | 12 ++-- 7 files changed, 106 insertions(+), 106 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 534d18cc4a6af..02d3587cab0a3 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -126,7 +126,7 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn) } static void mark_vcpu_memory_idle(struct kvm_vm *vm, - struct perf_test_vcpu_args *vcpu_args) + struct memstress_vcpu_args *vcpu_args) { int vcpu_idx = vcpu_args->vcpu_idx; uint64_t base_gva = vcpu_args->gva; @@ -148,7 +148,7 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap."); for (page = 0; page < pages; page++) { - uint64_t gva = base_gva + page * perf_test_args.guest_page_size; + uint64_t gva = base_gva + page * memstress_args.guest_page_size; uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva); if (!pfn) { @@ -220,10 +220,10 @@ static bool spin_wait_for_next_iteration(int *current_iteration) return true; } -static void vcpu_thread_main(struct perf_test_vcpu_args *vcpu_args) +static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; - struct kvm_vm *vm = perf_test_args.vm; + struct kvm_vm *vm = memstress_args.vm; int vcpu_idx = vcpu_args->vcpu_idx; int current_iteration = 0; @@ -279,7 +279,7 @@ static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *descripti static void access_memory(struct kvm_vm *vm, int nr_vcpus, enum access_type access, const char *description) { - perf_test_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100); + memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100); iteration_work = ITERATION_ACCESS_MEMORY; run_iteration(vm, nr_vcpus, description); } @@ -303,10 +303,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int nr_vcpus = params->nr_vcpus; - vm = perf_test_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1, + vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1, params->backing_src, !overlap_memory_access); - perf_test_start_vcpu_threads(nr_vcpus, vcpu_thread_main); + memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main); pr_info("\n"); access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory"); @@ -324,8 +324,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Set done to signal the vCPU threads to exit */ done = true; - perf_test_join_vcpu_threads(nr_vcpus); - perf_test_destroy_vm(vm); + memstress_join_vcpu_threads(nr_vcpus); + memstress_destroy_vm(vm); } static void help(char *name) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 37501e83d1d87..3a977ddf07b20 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -42,7 +42,7 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; static size_t demand_paging_size; static char *guest_data_prototype; -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; int vcpu_idx = vcpu_args->vcpu_idx; @@ -285,7 +285,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int r, i; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); demand_paging_size = get_backing_src_pagesz(p->src_type); @@ -307,11 +307,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); for (i = 0; i < nr_vcpus; i++) { - struct perf_test_vcpu_args *vcpu_args; + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; void *vcpu_alias; - vcpu_args = &perf_test_args.vcpu_args[i]; + vcpu_args = &memstress_args.vcpu_args[i]; /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); @@ -329,17 +329,17 @@ static void run_test(enum vm_guest_mode mode, void *arg) pipefds[i * 2], p->uffd_mode, p->uffd_delay, &uffd_args[i], vcpu_hva, vcpu_alias, - vcpu_args->pages * perf_test_args.guest_page_size); + vcpu_args->pages * memstress_args.guest_page_size); } } pr_info("Finished creating vCPUs and starting uffd threads\n"); clock_gettime(CLOCK_MONOTONIC, &start); - perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); pr_info("Started all vCPUs\n"); - perf_test_join_vcpu_threads(nr_vcpus); + memstress_join_vcpu_threads(nr_vcpus); ts_diff = timespec_elapsed(start); pr_info("All vCPU threads joined\n"); @@ -358,10 +358,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", - perf_test_args.vcpu_args[0].pages * nr_vcpus / + memstress_args.vcpu_args[0].pages * nr_vcpus / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - perf_test_destroy_vm(vm); + memstress_destroy_vm(vm); free(guest_data_prototype); if (p->uffd_mode) { diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index d2bac493da5dd..c33e89012ae6f 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -67,7 +67,7 @@ static bool host_quit; static int iteration; static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; int vcpu_idx = vcpu_args->vcpu_idx; @@ -141,7 +141,7 @@ static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) int i; for (i = 0; i < slots; i++) { - int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; vm_mem_region_set_flags(vm, slot, flags); @@ -163,7 +163,7 @@ static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots int i; for (i = 0; i < slots; i++) { - int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); } @@ -175,7 +175,7 @@ static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int i; for (i = 0; i < slots; i++) { - int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); } @@ -223,13 +223,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; int i; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, p->slots, p->backing_src, p->partition_vcpu_memory_access); pr_info("Random seed: %u\n", p->random_seed); - perf_test_set_random_seed(vm, p->random_seed); - perf_test_set_write_percent(vm, p->write_percent); + memstress_set_random_seed(vm, p->random_seed); + memstress_set_write_percent(vm, p->write_percent); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); @@ -259,9 +259,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) * occurring during the dirty memory iterations below, which * would pollute the performance results. */ - perf_test_set_write_percent(vm, 100); - perf_test_set_random_access(vm, false); - perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); + memstress_set_write_percent(vm, 100); + memstress_set_random_access(vm, false); + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); /* Allow the vCPUs to populate memory */ pr_debug("Starting iteration %d - Populating\n", iteration); @@ -282,8 +282,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); - perf_test_set_write_percent(vm, p->write_percent); - perf_test_set_random_access(vm, p->random_access); + memstress_set_write_percent(vm, p->write_percent); + memstress_set_random_access(vm, p->random_access); while (iteration < p->iterations) { /* @@ -345,7 +345,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) * wait for them to exit. */ host_quit = true; - perf_test_join_vcpu_threads(nr_vcpus); + memstress_join_vcpu_threads(nr_vcpus); avg = timespec_div(get_dirty_log_total, p->iterations); pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", @@ -361,7 +361,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) free_bitmaps(bitmaps, p->slots); arch_cleanup_vm(vm); - perf_test_destroy_vm(vm); + memstress_destroy_vm(vm); } static void help(char *name) @@ -466,7 +466,7 @@ int main(int argc, char *argv[]) guest_modes_cmdline(optarg); break; case 'n': - perf_test_args.nested = true; + memstress_args.nested = true; break; case 'o': p.partition_vcpu_memory_access = false; @@ -500,9 +500,9 @@ int main(int argc, char *argv[]) } if (pcpu_list) { - kvm_parse_vcpu_pinning(pcpu_list, perf_test_args.vcpu_to_pcpu, + kvm_parse_vcpu_pinning(pcpu_list, memstress_args.vcpu_to_pcpu, nr_vcpus); - perf_test_args.pin_vcpus = true; + memstress_args.pin_vcpus = true; } TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations"); diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h index 64a523e061259..bbd2a302df100 100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -17,9 +17,9 @@ #define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ -#define PERF_TEST_MEM_SLOT_INDEX 1 +#define MEMSTRESS_MEM_SLOT_INDEX 1 -struct perf_test_vcpu_args { +struct memstress_vcpu_args { uint64_t gpa; uint64_t gva; uint64_t pages; @@ -29,7 +29,7 @@ struct perf_test_vcpu_args { int vcpu_idx; }; -struct perf_test_args { +struct memstress_args { struct kvm_vm *vm; /* The starting address and size of the guest test region. */ uint64_t gpa; @@ -47,26 +47,26 @@ struct perf_test_args { /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */ uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS]; - struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS]; + struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS]; }; -extern struct perf_test_args perf_test_args; +extern struct memstress_args memstress_args; -struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src, bool partition_vcpu_memory_access); -void perf_test_destroy_vm(struct kvm_vm *vm); +void memstress_destroy_vm(struct kvm_vm *vm); -void perf_test_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); -void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); -void perf_test_set_random_access(struct kvm_vm *vm, bool random_access); +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); +void memstress_set_random_access(struct kvm_vm *vm, bool random_access); -void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); -void perf_test_join_vcpu_threads(int vcpus); -void perf_test_guest_code(uint32_t vcpu_id); +void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); +void memstress_join_vcpu_threads(int vcpus); +void memstress_guest_code(uint32_t vcpu_id); -uint64_t perf_test_nested_pages(int nr_vcpus); -void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); +uint64_t memstress_nested_pages(int nr_vcpus); +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); #endif /* SELFTEST_KVM_MEMSTRESS_H */ diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 255f77d863304..503da78c558dd 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -10,7 +10,7 @@ #include "memstress.h" #include "processor.h" -struct perf_test_args perf_test_args; +struct memstress_args memstress_args; /* * Guest virtual memory offset of the testing memory slot. @@ -33,7 +33,7 @@ struct vcpu_thread { static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; /* The function run by each vCPU thread, as provided by the test. */ -static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *); +static void (*vcpu_thread_fn)(struct memstress_vcpu_args *); /* Set to true once all vCPU threads are up and running. */ static bool all_vcpu_threads_running; @@ -44,10 +44,10 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; * Continuously write to the first 8 bytes of each page in the * specified region. */ -void perf_test_guest_code(uint32_t vcpu_idx) +void memstress_guest_code(uint32_t vcpu_idx) { - struct perf_test_args *args = &perf_test_args; - struct perf_test_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx]; + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx]; struct guest_random_state rand_state; uint64_t gva; uint64_t pages; @@ -82,13 +82,13 @@ void perf_test_guest_code(uint32_t vcpu_idx) } } -void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, +void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[], uint64_t vcpu_memory_bytes, bool partition_vcpu_memory_access) { - struct perf_test_args *args = &perf_test_args; - struct perf_test_vcpu_args *vcpu_args; + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args; int i; for (i = 0; i < nr_vcpus; i++) { @@ -118,12 +118,12 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, } } -struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src, bool partition_vcpu_memory_access) { - struct perf_test_args *args = &perf_test_args; + struct memstress_args *args = &memstress_args; struct kvm_vm *vm; uint64_t guest_num_pages, slot0_pages = 0; uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); @@ -157,7 +157,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * in-memory data structures. */ if (args->nested) - slot0_pages += perf_test_nested_pages(nr_vcpus); + slot0_pages += memstress_nested_pages(nr_vcpus); /* * Pass guest_num_pages to populate the page tables for test memory. @@ -165,7 +165,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * effect as KVM allows aliasing HVAs in meslots. */ vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, - perf_test_guest_code, vcpus); + memstress_guest_code, vcpus); args->vm = vm; @@ -206,59 +206,59 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i; vm_userspace_mem_region_add(vm, backing_src, region_start, - PERF_TEST_MEM_SLOT_INDEX + i, + MEMSTRESS_MEM_SLOT_INDEX + i, region_pages, 0); } /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages); - perf_test_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, + memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access); if (args->nested) { pr_info("Configuring vCPUs to run in L2 (nested).\n"); - perf_test_setup_nested(vm, nr_vcpus, vcpus); + memstress_setup_nested(vm, nr_vcpus, vcpus); } ucall_init(vm, NULL); /* Export the shared variables to the guest. */ - sync_global_to_guest(vm, perf_test_args); + sync_global_to_guest(vm, memstress_args); return vm; } -void perf_test_destroy_vm(struct kvm_vm *vm) +void memstress_destroy_vm(struct kvm_vm *vm) { ucall_uninit(vm); kvm_vm_free(vm); } -void perf_test_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) { - perf_test_args.write_percent = write_percent; - sync_global_to_guest(vm, perf_test_args.write_percent); + memstress_args.write_percent = write_percent; + sync_global_to_guest(vm, memstress_args.write_percent); } -void perf_test_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) { - perf_test_args.random_seed = random_seed; - sync_global_to_guest(vm, perf_test_args.random_seed); + memstress_args.random_seed = random_seed; + sync_global_to_guest(vm, memstress_args.random_seed); } -void perf_test_set_random_access(struct kvm_vm *vm, bool random_access) +void memstress_set_random_access(struct kvm_vm *vm, bool random_access) { - perf_test_args.random_access = random_access; - sync_global_to_guest(vm, perf_test_args.random_access); + memstress_args.random_access = random_access; + sync_global_to_guest(vm, memstress_args.random_access); } -uint64_t __weak perf_test_nested_pages(int nr_vcpus) +uint64_t __weak memstress_nested_pages(int nr_vcpus) { return 0; } -void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) +void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) { pr_info("%s() not support on this architecture, skipping.\n", __func__); exit(KSFT_SKIP); @@ -269,8 +269,8 @@ static void *vcpu_thread_main(void *data) struct vcpu_thread *vcpu = data; int vcpu_idx = vcpu->vcpu_idx; - if (perf_test_args.pin_vcpus) - kvm_pin_this_task_to_pcpu(perf_test_args.vcpu_to_pcpu[vcpu_idx]); + if (memstress_args.pin_vcpus) + kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]); WRITE_ONCE(vcpu->running, true); @@ -283,13 +283,13 @@ static void *vcpu_thread_main(void *data) while (!READ_ONCE(all_vcpu_threads_running)) ; - vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu_idx]); + vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]); return NULL; } -void perf_test_start_vcpu_threads(int nr_vcpus, - void (*vcpu_fn)(struct perf_test_vcpu_args *)) +void memstress_start_vcpu_threads(int nr_vcpus, + void (*vcpu_fn)(struct memstress_vcpu_args *)) { int i; @@ -313,7 +313,7 @@ void perf_test_start_vcpu_threads(int nr_vcpus, WRITE_ONCE(all_vcpu_threads_running, true); } -void perf_test_join_vcpu_threads(int nr_vcpus) +void memstress_join_vcpu_threads(int nr_vcpus) { int i; diff --git a/tools/testing/selftests/kvm/lib/x86_64/memstress.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c index 0bb717ac2cc57..2b3b47e4a973e 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86_64/memstress.c @@ -15,21 +15,21 @@ #include "processor.h" #include "vmx.h" -void perf_test_l2_guest_code(uint64_t vcpu_id) +void memstress_l2_guest_code(uint64_t vcpu_id) { - perf_test_guest_code(vcpu_id); + memstress_guest_code(vcpu_id); vmcall(); } -extern char perf_test_l2_guest_entry[]; +extern char memstress_l2_guest_entry[]; __asm__( -"perf_test_l2_guest_entry:" +"memstress_l2_guest_entry:" " mov (%rsp), %rdi;" -" call perf_test_l2_guest_code;" +" call memstress_l2_guest_code;" " ud2;" ); -static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) +static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; @@ -42,14 +42,14 @@ static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; *rsp = vcpu_id; - prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp); + prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); GUEST_DONE(); } -uint64_t perf_test_nested_pages(int nr_vcpus) +uint64_t memstress_nested_pages(int nr_vcpus) { /* * 513 page tables is enough to identity-map 256 TiB of L2 with 1G @@ -59,7 +59,7 @@ uint64_t perf_test_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; @@ -72,12 +72,12 @@ void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) */ nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); - start = align_down(perf_test_args.gpa, PG_SIZE_1G); - end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G); + start = align_down(memstress_args.gpa, PG_SIZE_1G); + end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); nested_identity_map_1g(vmx, vm, start, end - start); } -void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { struct vmx_pages *vmx, *vmx0 = NULL; struct kvm_regs regs; @@ -90,7 +90,7 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vmx = vcpu_alloc_vmx(vm, &vmx_gva); if (vcpu_id == 0) { - perf_test_setup_ept(vmx, vm); + memstress_setup_ept(vmx, vm); vmx0 = vmx; } else { /* Share the same EPT table across all vCPUs. */ @@ -100,11 +100,11 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc } /* - * Override the vCPU to run perf_test_l1_guest_code() which will - * bounce it into L2 before calling perf_test_guest_code(). + * Override the vCPU to run memstress_l1_guest_code() which will + * bounce it into L2 before calling memstress_guest_code(). */ vcpu_regs_get(vcpus[vcpu_id], ®s); - regs.rip = (unsigned long) perf_test_l1_guest_code; + regs.rip = (unsigned long) memstress_l1_guest_code; vcpu_regs_set(vcpus[vcpu_id], ®s); vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 0490bd4606e50..d07e921bfcc53 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -36,7 +36,7 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; static bool run_vcpus = true; -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_run *run; @@ -75,7 +75,7 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, * Add the dummy memslot just below the memstress memslot, which is * at the top of the guest physical address space. */ - gpa = perf_test_args.gpa - pages * vm->page_size; + gpa = memstress_args.gpa - pages * vm->page_size; for (i = 0; i < nr_modifications; i++) { usleep(delay); @@ -97,13 +97,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct test_params *p = arg; struct kvm_vm *vm; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS, p->partition_vcpu_memory_access); pr_info("Finished creating vCPUs\n"); - perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); pr_info("Started all vCPUs\n"); @@ -111,10 +111,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) run_vcpus = false; - perf_test_join_vcpu_threads(nr_vcpus); + memstress_join_vcpu_threads(nr_vcpus); pr_info("All vCPU threads joined\n"); - perf_test_destroy_vm(vm); + memstress_destroy_vm(vm); } static void help(char *name) -- GitLab From a4ff8e7a71601321f7bf7b58ede664dc0d774274 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Wed, 16 Nov 2022 09:56:37 +0800 Subject: [PATCH 0653/1423] PCI/DOE: Fix maximum data object length miscalculation Per PCIe r6.0, sec 6.30.1, a data object Length of 0x0 indicates 2^18 DWORDs (256K DW or 1MB) being transferred. Adjust the value of data object length for this case on both sending side and receiving side. Don't bother checking whether Length is greater than SZ_1M because all values of the 18-bit Length field are valid, and it is impossible to represent anything larger than SZ_1M: 0x00000 256K DW (1M bytes) 0x00001 1 DW (4 bytes) ... 0x3ffff 256K-1 DW (1M - 4 bytes) [bhelgaas: commit log] Link: https://lore.kernel.org/r/20221116015637.3299664-1-ming4.li@intel.com Fixes: 9d24322e887b ("PCI/DOE: Add DOE mailbox support functions") Signed-off-by: Li Ming Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Lukas Wunner Cc: stable@vger.kernel.org # v6.0+ --- drivers/pci/doe.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c index e402f05068a53..66d9ab2886468 100644 --- a/drivers/pci/doe.c +++ b/drivers/pci/doe.c @@ -29,6 +29,9 @@ #define PCI_DOE_FLAG_CANCEL 0 #define PCI_DOE_FLAG_DEAD 1 +/* Max data object length is 2^18 dwords */ +#define PCI_DOE_MAX_LENGTH (1 << 18) + /** * struct pci_doe_mb - State for a single DOE mailbox * @@ -107,6 +110,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb, { struct pci_dev *pdev = doe_mb->pdev; int offset = doe_mb->cap_offset; + size_t length; u32 val; int i; @@ -123,15 +127,20 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb, if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) return -EIO; + /* Length is 2 DW of header + length of payload in DW */ + length = 2 + task->request_pl_sz / sizeof(u32); + if (length > PCI_DOE_MAX_LENGTH) + return -EIO; + if (length == PCI_DOE_MAX_LENGTH) + length = 0; + /* Write DOE Header */ val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) | FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type); pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val); - /* Length is 2 DW of header + length of payload in DW */ pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, - 2 + task->request_pl_sz / - sizeof(u32))); + length)); for (i = 0; i < task->request_pl_sz / sizeof(u32); i++) pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, task->request_pl[i]); @@ -178,7 +187,10 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val); - if (length > SZ_1M || length < 2) + /* A value of 0x0 indicates max data object length */ + if (!length) + length = PCI_DOE_MAX_LENGTH; + if (length < 2) return -EIO; /* First 2 dwords have already been read */ -- GitLab From 9a48b4a6fd512bdaed7e38ba844be743163d49c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:13 -0800 Subject: [PATCH 0654/1423] xfs: fully initialize xfs_da_args in xchk_directory_blocks While running the online fsck test suite, I noticed the following assertion in the kernel log (edited for brevity): XFS: Assertion failed: 0, file: fs/xfs/xfs_health.c, line: 571 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 11667 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 3 PID: 11667 Comm: xfs_scrub Tainted: G W 5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Call Trace: xfs_dir2_isblock+0xcc/0xe0 xchk_directory_blocks+0xc7/0x420 xchk_directory+0x53/0xb0 xfs_scrub_metadata+0x2b6/0x6b0 xfs_scrubv_metadata+0x35e/0x4d0 xfs_ioc_scrubv_metadata+0x111/0x160 xfs_file_ioctl+0x4ec/0xef0 __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This assertion triggers in xfs_dirattr_mark_sick when the caller passes in a whichfork value that is neither of XFS_{DATA,ATTR}_FORK. The cause of this is that xchk_directory_blocks only partially initializes the xfs_da_args structure that is passed to xfs_dir2_isblock. If the data fork is not correct, the XFS_IS_CORRUPT clause will trigger. My development branch reports this failure to the health monitoring subsystem, which accesses the uninitialized args->whichfork field, leading the the assertion tripping. We really shouldn't be passing random stack contents around, so the solution here is to force the compiler to zero-initialize the struct. Found by fuzzing u3.bmx[0].blockcount = middlebit on xfs/1554. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/dir.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5c87800ab223d..d1b0f23c2c593 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -666,7 +666,12 @@ xchk_directory_blocks( struct xfs_scrub *sc) { struct xfs_bmbt_irec got; - struct xfs_da_args args; + struct xfs_da_args args = { + .dp = sc ->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; @@ -689,9 +694,6 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - args.dp = sc->ip; - args.geo = mp->m_dir_geo; - args.trans = sc->tp; error = xfs_dir2_isblock(&args, &is_block); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; -- GitLab From be1317fdb8d4e3ccbac43e199b360c248c600d99 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: [PATCH 0655/1423] xfs: don't track the AGFL buffer in the scrub AG context While scrubbing an allocation group, we don't need to hold the AGFL buffer as part of the scrub context. All that is necessary to lock an AG is to hold the AGI and AGF buffers, so fix all the existing users of the AGFL buffer to grab them only when necessary. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 47 +++++++++++++++++++++------------- fs/xfs/scrub/agheader_repair.c | 1 - fs/xfs/scrub/common.c | 8 ------ fs/xfs/scrub/repair.c | 11 ++++---- fs/xfs/scrub/scrub.h | 1 - 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index b7b838bd4ba43..af284baa6f4cb 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -609,9 +609,16 @@ out: /* AGFL */ struct xchk_agfl_info { - unsigned int sz_entries; + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. */ xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; struct xfs_scrub *sc; }; @@ -641,10 +648,10 @@ xchk_agfl_block( struct xfs_scrub *sc = sai->sc; if (xfs_verify_agbno(sc->sa.pag, agbno) && - sai->nr_entries < sai->sz_entries) + sai->nr_entries < sai->agflcount) sai->entries[sai->nr_entries++] = agbno; else - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); + xchk_block_set_corrupt(sc, sai->agfl_bp); xchk_agfl_block_xref(sc, agbno); @@ -696,19 +703,26 @@ int xchk_agfl( struct xfs_scrub *sc) { - struct xchk_agfl_info sai; + struct xchk_agfl_info sai = { + .sc = sc, + }; struct xfs_agf *agf; xfs_agnumber_t agno = sc->sm->sm_agno; - unsigned int agflcount; unsigned int i; int error; + /* Lock the AGF and AGI so that nobody can touch this AG. */ error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) - goto out; + return error; if (!sc->sa.agf_bp) return -EFSCORRUPTED; - xchk_buffer_recheck(sc, sc->sa.agfl_bp); + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); xchk_agfl_xref(sc); @@ -717,24 +731,21 @@ xchk_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = sc->sa.agf_bp->b_addr; - agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > xfs_agfl_size(sc->mp)) { + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } - memset(&sai, 0, sizeof(sai)); - sai.sc = sc; - sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, - KM_MAYFAIL); + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, - sc->sa.agfl_bp, xchk_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; goto out_free; @@ -742,7 +753,7 @@ xchk_agfl( if (error) goto out_free; - if (agflcount != sai.nr_entries) { + if (sai.agflcount != sai.nr_entries) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out_free; } @@ -758,7 +769,7 @@ xchk_agfl( } out_free: - kmem_free(sai.entries); + kvfree(sai.entries); out: return error; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1b0b4e243f771..2e75ff9b5b2e9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -697,7 +697,6 @@ xrep_agfl( * freespace overflow to the freespace btrees. */ sc->sa.agf_bp = agf_bp; - sc->sa.agfl_bp = agfl_bp; error = xrep_roll_ag_trans(sc); if (error) goto err; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 9bbbf20f401b3..ad70f29233c36 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,10 +424,6 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - return error; - return 0; } @@ -515,10 +511,6 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); - if (sa->agfl_bp) { - xfs_trans_brelse(sc->tp, sa->agfl_bp); - sa->agfl_bp = NULL; - } if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c18bd039fce9e..2ada7fc1c3983 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -126,8 +126,6 @@ xrep_roll_ag_trans( xfs_trans_bhold(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bhold(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); /* * Roll the transaction. We still own the buffer and the buffer lock @@ -145,8 +143,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; } @@ -498,6 +494,7 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { + struct xfs_buf *agfl_bp; int error; /* Make sure there's space on the freelist. */ @@ -516,8 +513,12 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - sc->sa.agfl_bp, agbno, 0); + agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 3de5287e98d84..151567f883665 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -39,7 +39,6 @@ struct xchk_ag { /* AG btree roots */ struct xfs_buf *agf_bp; - struct xfs_buf *agfl_bp; struct xfs_buf *agi_bp; /* AG btrees */ -- GitLab From 3e59c0103e66d6e687a8b47fd70169542aba938e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: [PATCH 0656/1423] xfs: log the AGI/AGF buffers when rolling transactions during an AG repair Currently, the only way to lock an allocation group is to hold the AGI and AGF buffers. If a repair needs to roll the transaction while repairing some AG metadata, it maintains that lock by holding the two buffers across the transaction roll and joins them afterwards. However, repair is not like other parts of XFS that employ the bhold - roll - bjoin sequence because it's possible that the AGI or AGF buffers are not actually dirty before the roll. This presents two problems -- First, we need to redirty those buffers to keep them moving along in the log to avoid pinning the log tail. Second, a clean buffer log item can detach from the buffer. If this happens, the buffer type state is discarded along with the bli and must be reattached before the next time the buffer is logged. If it is not, the logging code will complain and log recovery will not work properly. An earlier version of this patch tried to fix the second problem by re-setting the buffer type in the bli after joining the buffer to the new transaction, but that looked weird and didn't solve the first problem. Instead, solve both problems by logging the buffer before rolling the transaction. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 2ada7fc1c3983..22335619c84e7 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -121,24 +121,36 @@ xrep_roll_ag_trans( { int error; - /* Keep the AG header buffers locked so we can keep going. */ - if (sc->sa.agi_bp) + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } /* - * Roll the transaction. We still own the buffer and the buffer lock - * regardless of whether or not the roll succeeds. If the roll fails, - * the buffers will be released during teardown on our way out of the - * kernel. If it succeeds, we join them to the new transaction and - * move on. + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. */ error = xfs_trans_roll(&sc->tp); if (error) return error; - /* Join AG headers to the new transaction. */ + /* Join the AG headers to the new transaction. */ if (sc->sa.agi_bp) xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) -- GitLab From 48ff40458f871fb19e7b1b40e0e5084b8751d9cb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: [PATCH 0657/1423] xfs: standardize GFP flags usage in online scrub Memory allocation usage is the same throughout online fsck -- we want kernel memory, we have to be able to back out if we can't allocate memory, and we don't want to spray dmesg with memory allocation failure reports. Standardize the GFP flag usage and document these requirements. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 2 +- fs/xfs/scrub/attr.c | 9 ++++----- fs/xfs/scrub/scrub.h | 9 +++++++++ fs/xfs/scrub/symlink.c | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index af284baa6f4cb..4dd52b15f09c6 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -737,7 +737,7 @@ xchk_agfl( goto out; } sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + XCHK_GFP_FLAGS); if (!sai.entries) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index b6f0c9f3f1245..11b2593a2be79 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -79,7 +79,8 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, + XCHK_GFP_FLAGS); if (error) return error; } @@ -138,8 +139,7 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,8 +324,7 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 151567f883665..a0f097b8acb0a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,15 @@ struct xfs_scrub; +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 75311f8daeeb8..c1c99ffe7408a 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -21,7 +21,7 @@ xchk_setup_symlink( struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. */ - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; -- GitLab From b255fab0f80cc65a334fcd90cd278673cddbc988 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: [PATCH 0658/1423] xfs: make AGFL repair function avoid crosslinked blocks Teach the AGFL repair function to check each block of the proposed AGFL against the rmap btree. If the rmapbt finds any mappings that are not OWN_AG, strike that block from the list. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 78 ++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2e75ff9b5b2e9..82ceb60ea5fcd 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -442,12 +442,18 @@ out_revert: /* AGFL */ struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xbitmap crossed; + /* Bitmap of other OWN_AG metadata blocks. */ struct xbitmap agmetablocks; /* Bitmap of free space. */ struct xbitmap *freesp; + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + struct xfs_scrub *sc; }; @@ -477,6 +483,41 @@ xrep_agfl_walk_rmap( return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } +/* Strike out the blocks that are cross-linked according to the rmapbt. */ +STATIC int +xrep_agfl_check_extent( + struct xrep_agfl *ra, + uint64_t start, + uint64_t len) +{ + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xbitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + /* * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce * which blocks belong to the AGFL. @@ -496,44 +537,58 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; + struct xbitmap_range *br, *n; int error; ra.sc = sc; ra.freesp = agfl_extents; xbitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; - xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); - xbitmap_destroy(&ra.agmetablocks); if (error) - return error; + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + for_each_xbitmap_extent(br, n, agfl_extents) { + error = xrep_agfl_check_extent(&ra, br->start, br->len); + if (error) + break; + } + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xbitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; /* * Calculate the new AGFL size. If we found more blocks than fit in @@ -541,11 +596,10 @@ xrep_agfl_collect_blocks( */ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), xfs_agfl_size(mp)); - return 0; -err: +out_bmp: + xbitmap_destroy(&ra.crossed); xbitmap_destroy(&ra.agmetablocks); - xfs_btree_del_cursor(cur, error); return error; } -- GitLab From a7a0f9a5503f4da3b6489583ce4ef9abc0ab2475 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: [PATCH 0659/1423] xfs: return EINTR when a fatal signal terminates scrub If the program calling online fsck is terminated with a fatal signal, bail out to userspace by returning EINTR, not EAGAIN. EAGAIN is used by scrubbers to indicate that we should try again with more resources locked, and not to indicate that the operation was cancelled. The miswiring is mostly harmless, but it shows up in the trace data. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 454145db10e71..b73648d81d230 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,7 +25,7 @@ xchk_should_terminate( if (fatal_signal_pending(current)) { if (*error == 0) - *error = -EAGAIN; + *error = -EINTR; return true; } return false; -- GitLab From 0a713bd41ea2b19904232b9c5278012c4361bc04 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: [PATCH 0660/1423] xfs: fix return code when fatal signal encountered during dquot scrub If the scrub process is sent a fatal signal while we're checking dquots, the predicate for this will set the error code to -EINTR. Don't then squash that into -ECANCELED, because the wrong errno turns up in the trace output. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 21b4c90068591..0b643ff32b222 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -84,7 +84,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return -ECANCELED; + return error; /* * Except for the root dquot, the actual dquot we got must either have -- GitLab From fcd2a43488d5a211aec94e28369b2a72c28258a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: [PATCH 0661/1423] xfs: initialize the check_owner object fully Initialize the check_owner list head so that we don't corrupt the list. Reduce the scope of the object pointer. Fixes: 858333dcf021 ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 2f4519590dc14..075ff3071122e 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -408,7 +408,6 @@ xchk_btree_check_owner( struct xfs_buf *bp) { struct xfs_btree_cur *cur = bs->cur; - struct check_owner *co; /* * In theory, xfs_btree_get_block should only give us a null buffer @@ -431,10 +430,14 @@ xchk_btree_check_owner( * later scanning. */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + struct check_owner *co; + co = kmem_alloc(sizeof(struct check_owner), KM_MAYFAIL); if (!co) return -ENOMEM; + + INIT_LIST_HEAD(&co->list); co->level = level; co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); -- GitLab From 6bf2f87915970160ded16c310e2e8887deff97a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: [PATCH 0662/1423] xfs: don't retry repairs harder when EAGAIN is returned Repair functions will not return EAGAIN -- if they were not able to obtain resources, they should return EDEADLOCK (like the rest of online fsck) to signal that we need to grab all the resources and try again. Hence we don't need to deal with this case except as a debugging assertion. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 22335619c84e7..7323bd9fddfb2 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -61,7 +61,6 @@ xrep_attempt( sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: - case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; @@ -74,6 +73,11 @@ xrep_attempt( */ return -EFSCORRUPTED; default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. + */ + ASSERT(error != -EAGAIN); return error; } } -- GitLab From 306195f355bbdcc3eff6cffac05bcd93a5e419ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: [PATCH 0663/1423] xfs: pivot online scrub away from kmem.[ch] Convert all the online scrub code to use the Linux slab allocator functions directly instead of going through the kmem wrappers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/attr.c | 2 +- fs/xfs/scrub/bitmap.c | 11 ++++++----- fs/xfs/scrub/btree.c | 9 ++++----- fs/xfs/scrub/dabtree.c | 4 ++-- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/refcount.c | 12 ++++++------ fs/xfs/scrub/scrub.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 82ceb60ea5fcd..d75d82151eeba 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -685,7 +685,7 @@ xrep_agfl_init_header( if (br->len) break; list_del(&br->list); - kmem_free(br); + kfree(br); } /* Write new AGFL to disk. */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 11b2593a2be79..31529b9bf3891 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -49,7 +49,7 @@ xchk_setup_xattr_buf( if (ab) { if (sz <= ab->sz) return 0; - kmem_free(ab); + kvfree(ab); sc->buf = NULL; } diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index b89bf9de9b1c5..a255f09e9f0a6 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "scrub/scrub.h" #include "scrub/bitmap.h" /* @@ -25,7 +26,7 @@ xbitmap_set( { struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); if (!bmr) return -ENOMEM; @@ -47,7 +48,7 @@ xbitmap_destroy( for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); - kmem_free(bmr); + kfree(bmr); } } @@ -174,15 +175,15 @@ xbitmap_disunion( /* Total overlap, just delete ex. */ lp = lp->next; list_del(&br->list); - kmem_free(br); + kfree(br); break; case 0: /* * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xbitmap_range), - KM_MAYFAIL); + new_br = kmalloc(sizeof(struct xbitmap_range), + XCHK_GFP_FLAGS); if (!new_br) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 075ff3071122e..0fd36d5b46467 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -432,8 +432,7 @@ xchk_btree_check_owner( if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { struct check_owner *co; - co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL); + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); if (!co) return -ENOMEM; @@ -652,7 +651,7 @@ xchk_btree( xchk_btree_set_corrupt(sc, cur, 0); return 0; } - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); if (!bs) return -ENOMEM; bs->cur = cur; @@ -743,9 +742,9 @@ out: error = xchk_btree_check_block_owner(bs, co->level, co->daddr); list_del(&co->list); - kmem_free(co); + kfree(co); } - kmem_free(bs); + kfree(bs); return error; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 84fe3d33d6995..d17cee1770854 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -486,7 +486,7 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); if (!ds) return -ENOMEM; ds->dargs.dp = sc->ip; @@ -591,6 +591,6 @@ out: out_state: xfs_da_state_free(ds->state); - kmem_free(ds); + kfree(ds); return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 6a6f8fe7f87c0..3c56f5890da4f 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -116,7 +116,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; fsc = sc->buf; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index a26ee0f24ef2a..d9c1b3cea4a52 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check( * is healthy each rmap_irec we see will be in agbno order * so we don't need insertion sort here. */ - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), - KM_MAYFAIL); + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments( continue; } list_del(&frag->list); - kmem_free(frag); + kfree(frag); nr++; } @@ -257,11 +257,11 @@ done: /* Delete fragments and work list. */ list_for_each_entry_safe(frag, n, &worklist, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } list_for_each_entry_safe(frag, n, &refchk->fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } @@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap( out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2e8e400f10a9a..07a7a75f987fc 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -174,7 +174,7 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { - kmem_free(sc->buf); + kvfree(sc->buf); sc->buf = NULL; } return error; @@ -467,7 +467,7 @@ xfs_scrub_metadata( xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; goto out; @@ -557,7 +557,7 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: - kmem_free(sc); + kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { -- GitLab From 9e13975bb0620c2bfa1a4d2943e7eb8514f7708e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: [PATCH 0664/1423] xfs: load rtbitmap and rtsummary extent mapping btrees at mount time It turns out that GETFSMAP and online fsck have had a bug for years due to their use of ILOCK_SHARED to coordinate their linear scans of the realtime bitmap. If the bitmap file's data fork happens to be in BTREE format and the scan occurs immediately after mounting, the incore bmbt will not be populated, leading to ASSERTs tripping over the incorrect inode state. Because the bitmap scans always lock bitmap buffers in increasing order of file offset, it is appropriate for these two callers to take a shared ILOCK to improve scalability. To fix this problem, load both data and attr fork state into memory when mounting the realtime inodes. Realtime metadata files aren't supposed to have an attr fork so the second step is likely a nop. On most filesystems this is unlikely since the rtbitmap data fork is usually in extents format, but it's possible to craft a filesystem that will by fragmenting the free space in the data section and growfsing the rt section. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Also-Fixes: 46d9bfb5e706 ("xfs: cross-reference the realtime bitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_rtalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92cc..b0846204c436d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1325,6 +1325,41 @@ xfs_rtalloc_reinit_frextents( return 0; } +/* + * Read in the bmbt of an rt metadata inode so that we never have to load them + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use + * an empty transaction to avoid deadlocking on loops in the bmbt. + */ +static inline int +xfs_rtmount_iread_extents( + struct xfs_inode *ip, + unsigned int lock_class) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + } + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_trans_cancel(tp); + return error; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. @@ -1342,14 +1377,27 @@ xfs_rtmount_inodes( return error; ASSERT(mp->m_rbmip != NULL); + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); + if (error) + goto out_rele_bitmap; + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (error) { - xfs_irele(mp->m_rbmip); - return error; - } + if (error) + goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); + + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); + if (error) + goto out_rele_summary; + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; + +out_rele_summary: + xfs_irele(mp->m_rsumip); +out_rele_bitmap: + xfs_irele(mp->m_rbmip); + return error; } void -- GitLab From 11f97e684583469fc342a561387cc44fac4f9b1f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: [PATCH 0665/1423] xfs: skip fscounters comparisons when the scan is incomplete If any part of the per-AG summary counter scan loop aborts without collecting all of the data we need, the scrubber's observation data will be invalid. Set the incomplete flag so that we abort the scrub without reporting false corruptions. Document the data dependency here too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 3c56f5890da4f..eeb36ac2c6d2c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -138,6 +138,18 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ static int xchk_fscount_btreeblks( @@ -225,8 +237,10 @@ retry: } if (pag) xfs_perag_put(pag); - if (error) + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore @@ -267,6 +281,11 @@ retry: return 0; } +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. + */ + /* * Is the @counter reasonably close to the @expected value? * -- GitLab From 93b0c58ed04b6cbe45354f23bb5628fff31f9084 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: [PATCH 0666/1423] xfs: don't return -EFSCORRUPTED from repair when resources cannot be grabbed If we tried to repair something but the repair failed with -EDEADLOCK, that means that the repair function couldn't grab some resource it needed and wants us to try again. If we try again (with TRY_HARDER) but still can't get all the resources we need, the repair fails and errors remain on the filesystem. Right now, repair returns the -EDEADLOCK to the caller as -EFSCORRUPTED, which results in XFS_SCRUB_OFLAG_CORRUPT being passed out to userspace. This is not correct because repair has not determined that anything is corrupt. If the repair had been invoked on an object that could be optimized but wasn't corrupt (OFLAG_PREEN), the inability to grab resources will be reported to userspace as corrupt metadata, and users will be unnecessarily alarmed that their suboptimal metadata turned into a corruption. Fix this by returning zero so that the results of the actual scrub will be copied back out to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 7323bd9fddfb2..4b92f9253ccd2 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -69,9 +69,9 @@ xrep_attempt( /* * We tried harder but still couldn't grab all the resources * we needed to fix it. The corruption has not been fixed, - * so report back to userspace. + * so exit to userspace with the scan's output flags unchanged. */ - return -EFSCORRUPTED; + return 0; default: /* * EAGAIN tells the caller to re-scrub, so we cannot return -- GitLab From 5f369dc5b4eb2becbdfd08924dcaf00e391f4ea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: [PATCH 0667/1423] xfs: make rtbitmap ILOCKing consistent when scanning the rt bitmap file xfs_rtalloc_query_range scans the realtime bitmap file in order of increasing file offset, so this caller can take ILOCK_SHARED on the rt bitmap inode instead of ILOCK_EXCL. This isn't going to yield any practical benefits at mount time, but we'd like to make the locking usage consistent around xfs_rtalloc_query_all calls. Make all the places we do this use the same xfs_ilock lockflags for consistency. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsmap.c | 4 ++-- fs/xfs/xfs_rtalloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d8337274c74d0..88a88506ffffe 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b0846204c436d..16534e9873f69 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); if (error) return error; -- GitLab From e74331d6fa2c21a8ecccfe0648dad5193b83defe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: [PATCH 0668/1423] xfs: online checking of the free rt extent count Teach the summary count checker to count the number of free realtime extents and compare that to the superblock copy. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 86 ++++++++++++++++++++++++++++++++++++++- fs/xfs/scrub/scrub.h | 8 ---- 2 files changed, 84 insertions(+), 10 deletions(-) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index eeb36ac2c6d2c..4777e7b89fdc6 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -14,6 +14,8 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -43,6 +45,16 @@ * our tolerance for mismatch between expected and actual counter values. */ +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; +}; + /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -120,6 +132,7 @@ xchk_setup_fscounters( if (!sc->buf) return -ENOMEM; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -281,6 +294,59 @@ retry: return 0; } +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* * Part 2: Comparing filesystem summary counters. All we have to do here is * sum the percpu counters and compare them to what we've observed. @@ -352,16 +418,17 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) xchk_set_corrupt(sc); /* See if icount is obviously wrong. */ @@ -372,6 +439,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -386,6 +457,13 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + /* Compare the in-core counters with whatever we counted. */ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) xchk_set_corrupt(sc); @@ -397,5 +475,9 @@ xchk_fscounters( fsc->fdblocks)) xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) + xchk_set_corrupt(sc); + return 0; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a0f097b8acb0a..b4d391b4c9385 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -169,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif -struct xchk_fscounters { - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - unsigned long long icount_min; - unsigned long long icount_max; -}; - #endif /* __XFS_SCRUB_SCRUB_H__ */ -- GitLab From 033985b6fe875a7a971cf4e3941e1f3085ba037c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: [PATCH 0669/1423] xfs: fix perag loop in xchk_bmap_check_rmaps sparse complains that we can return an uninitialized error from this function and that pag could be uninitialized. We know that there are no zero-AG filesystems and hence we had to call xchk_bmap_check_ag_rmaps at least once, so this is not actually possible, but I'm too worn out from automated complaints from unsophisticated AIs so let's just fix this and move on to more interesting problems, eh? Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f0b9cb6506fdd..cb203e083a4c8 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -599,14 +599,14 @@ xchk_bmap_check_rmaps( for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); - if (error) - break; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - break; + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_put(pag); + return error; + } } - if (pag) - xfs_perag_put(pag); - return error; + + return 0; } /* -- GitLab From 6a5777865eebee1b53d7ae0fd2fa9ec2c6596df6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: [PATCH 0670/1423] xfs: teach scrub to check for adjacent bmaps when rmap larger than bmap When scrub is checking file fork mappings against rmap records and the rmap record starts before or ends after the bmap record, check the adjacent bmap records to make sure that they're adjacent to the one we're checking. This helps us to detect cases where the rmaps cover territory that the bmaps do not. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 74 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index cb203e083a4c8..a4a156dcaa8a0 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -90,6 +90,7 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + struct xfs_iext_cursor icur; xfs_fileoff_t lastoff; bool is_rt; bool is_shared; @@ -146,6 +147,48 @@ xchk_bmap_get_rmap( return has_rmap; } +static inline bool +xchk_bmap_has_prev( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) + return false; + if (got.br_startoff + got.br_blockcount != irec->br_startoff) + return false; + if (got.br_startblock + got.br_blockcount != irec->br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + +static inline bool +xchk_bmap_has_next( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) + return false; + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) + return false; + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + /* Make sure that we have rmapbt records for this extent. */ STATIC void xchk_bmap_xref_rmap( @@ -214,6 +257,34 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + + /* + * If the rmap starts before this bmbt record, make sure there's a bmbt + * record for the previous offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && + !xchk_bmap_has_prev(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } + + /* + * If the rmap ends after this bmbt record, make sure there's a bmbt + * record for the next offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (info->whichfork != XFS_COW_FORK && + rmap_end > agbno + irec->br_blockcount && + !xchk_bmap_has_next(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } } /* Cross-reference a single rtdev extent record. */ @@ -626,7 +697,6 @@ xchk_bmap( struct xfs_inode *ip = sc->ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; - struct xfs_iext_cursor icur; int error = 0; /* Non-existent forks can be ignored. */ @@ -690,7 +760,7 @@ xchk_bmap( /* Scrub extent records. */ info.lastoff = 0; ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &icur, &irec) { + for_each_xfs_iext(ifp, &info.icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; -- GitLab From 830ffa09fb130d31f111848a75b65506fdac623d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: [PATCH 0671/1423] xfs: block map scrub should handle incore delalloc reservations Enhance the block map scrubber to check delayed allocation reservations. Though there are no physical space allocations to check, we do need to make sure that the range of file offsets being mapped are correct, and to bump the lastoff cursor so that key order checking works correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 55 +++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4a156dcaa8a0..fa8f22ed057a7 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -368,14 +368,13 @@ xchk_bmap_dirattr_extent( } /* Scrub a single extent record. */ -STATIC int +STATIC void xchk_bmap_iextent( struct xfs_inode *ip, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - int error = 0; /* * Check for out-of-order extents. This record could have come @@ -396,14 +395,6 @@ xchk_bmap_iextent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check for delalloc extents. We never iterate the ones in the - * in-core extent scan, and we should never see these in the bmbt. - */ - if (isnullstartblock(irec->br_startblock)) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -424,15 +415,12 @@ xchk_bmap_iextent( irec->br_startoff); if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return 0; + return; if (info->is_rt) xchk_bmap_rt_iextent_xref(ip, info, irec); else xchk_bmap_iextent_xref(ip, info, irec); - - info->lastoff = irec->br_startoff + irec->br_blockcount; - return error; } /* Scrub a bmbt record. */ @@ -680,6 +668,33 @@ xchk_bmap_check_rmaps( return 0; } +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + /* * Scrub an inode fork's block mappings. * @@ -764,16 +779,18 @@ xchk_bmap( if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; - if (isnullstartblock(irec.br_startblock)) - continue; + if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); goto out; } - error = xchk_bmap_iextent(ip, &info, &irec); - if (error) - goto out; + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + info.lastoff = irec.br_startoff + irec.br_blockcount; } error = xchk_bmap_check_rmaps(sc, whichfork); -- GitLab From f23c40443d1c2af87c99c1424f9e43bbd7307f92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: [PATCH 0672/1423] xfs: check quota files for unwritten extents Teach scrub to flag quota files containing unwritten extents. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 0b643ff32b222..9eeac85653948 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -14,6 +14,7 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -189,11 +190,12 @@ xchk_quota_data_fork( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; + /* - * delalloc extents or blocks mapped above the highest + * delalloc/unwritten extents or blocks mapped above the highest * quota id shouldn't happen. */ - if (isnullstartblock(irec.br_startblock) || + if (!xfs_bmap_is_written_extent(&irec) || irec.br_startoff > max_dqid_off || irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, -- GitLab From 31785537010a91a0d1dc403e5d049a38a3d4a30b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: [PATCH 0673/1423] xfs: check that CoW fork extents are not shared Ensure that extents in an inode's CoW fork are not marked as shared in the refcount btree. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa8f22ed057a7..4bb6672b02ba1 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -335,6 +335,8 @@ xchk_bmap_iextent_xref( case XFS_COW_FORK: xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); break; } -- GitLab From 5eef46358fae1a6018d9f886a3ecd30e843728dd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:22 -0800 Subject: [PATCH 0674/1423] xfs: teach scrub to flag non-extents format cow forks CoW forks only exist in memory, which means that they can only ever have an incore extent tree. Hence they must always be FMT_EXTENTS, so check this when we're scrubbing them. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 4bb6672b02ba1..d50d0eab196ab 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -748,6 +748,8 @@ xchk_bmap( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: /* No mappings to check. */ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); goto out; case XFS_DINODE_FMT_EXTENTS: break; -- GitLab From bd5ab5f9874109586cbae5bc98e1f9ff574627e2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: [PATCH 0675/1423] xfs: don't warn about files that are exactly s_maxbytes long We can handle files that are exactly s_maxbytes bytes long; we just can't handle anything larger than that. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 51820b40ab1c8..7a2f38e5202c3 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -365,7 +365,7 @@ xchk_dinode( * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ - if (isize >= mp->m_super->s_maxbytes) + if (isize > mp->m_super->s_maxbytes) xchk_ino_set_warning(sc, ino); /* di_nblocks */ -- GitLab From f36b954a1f1bf06b5746fea7ecf0fa639ac65324 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: [PATCH 0676/1423] xfs: check inode core when scrubbing metadata files Metadata files (e.g. realtime bitmaps and quota files) do not show up in the bulkstat output, which means that scrub-by-handle does not work; they can only be checked through a specific scrub type. Therefore, each scrub type calls xchk_metadata_inode_forks to check the metadata for whatever's in the file. Unfortunately, that function doesn't actually check the inode record itself. Refactor the function a bit to make that happen. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ad70f29233c36..613260b04a3dc 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -781,6 +781,33 @@ xchk_buffer_recheck( trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + /* * Scrub the attr/data forks of a metadata inode. The metadata inode must be * pointed to by sc->ip and the ILOCK must be held. @@ -789,13 +816,17 @@ int xchk_metadata_inode_forks( struct xfs_scrub *sc) { - __u32 smtype; bool shared; int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + /* Check the inode record. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + /* Metadata inodes don't live on the rt device. */ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -815,10 +846,7 @@ xchk_metadata_inode_forks( } /* Invoke the data fork scrubber. */ - smtype = sc->sm->sm_type; - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; - error = xchk_bmap_data(sc); - sc->sm->sm_type = smtype; + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; @@ -833,7 +861,7 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } - return error; + return 0; } /* -- GitLab From 1cec8bbc1764964de24d19983fbf9fee6ce3c09d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Nov 2022 00:23:49 +0000 Subject: [PATCH 0677/1423] KVM: arm64: selftests: Disable single-step with correct KVM define Disable single-step by setting debug.control to KVM_GUESTDBG_ENABLE, not to SINGLE_STEP_DISABLE. The latter is an arbitrary test enum that just happens to have the same value as KVM_GUESTDBG_ENABLE, and so effectively disables single-step debug. No functional change intended. Cc: Reiji Watanabe Fixes: b18e4d4aebdd ("KVM: arm64: selftests: Add a test case for KVM_GUESTDBG_SINGLESTEP") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221117002350.2178351-2-seanjc@google.com Reviewed-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/debug-exceptions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 878c334607e1a..0316f225d36a6 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -369,7 +369,7 @@ void test_single_step_from_userspace(int test_cnt) KVM_GUESTDBG_SINGLESTEP; ss_enable = true; } else { - debug.control = SINGLE_STEP_DISABLE; + debug.control = KVM_GUESTDBG_ENABLE; ss_enable = false; } -- GitLab From b3d937722de0e64eebe267451a0e3d5ed5107ef7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Nov 2022 00:23:50 +0000 Subject: [PATCH 0678/1423] KVM: arm64: selftests: Disable single-step without relying on ucall() Automatically disable single-step when the guest reaches the end of the verified section instead of using an explicit ucall() to ask userspace to disable single-step. An upcoming change to implement a pool-based scheme for ucall() will add an atomic operation (bit test and set) in the guest ucall code, and if the compiler generate "old school" atomics, e.g. 40e57c: c85f7c20 ldxr x0, [x1] 40e580: aa100011 orr x17, x0, x16 40e584: c80ffc31 stlxr w15, x17, [x1] 40e588: 35ffffaf cbnz w15, 40e57c <__aarch64_ldset8_sync+0x1c> the guest will hang as the local exclusive monitor is reset by eret, i.e. the stlxr will always fail due to the debug exception taken to EL2. Link: https://lore.kernel.org/all/20221006003409.649993-8-seanjc@google.com Cc: Oliver Upton Cc: Marc Zyngier Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221117002350.2178351-3-seanjc@google.com Reviewed-by: Oliver Upton --- .../selftests/kvm/aarch64/debug-exceptions.c | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 0316f225d36a6..a3c2216b65fb3 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -241,7 +241,6 @@ static void guest_svc_handler(struct ex_regs *regs) enum single_step_op { SINGLE_STEP_ENABLE = 0, - SINGLE_STEP_DISABLE = 1, }; static void guest_code_ss(int test_cnt) @@ -258,7 +257,7 @@ static void guest_code_ss(int test_cnt) GUEST_SYNC(SINGLE_STEP_ENABLE); /* - * The userspace will veriry that the pc is as expected during + * The userspace will verify that the pc is as expected during * single step execution between iter_ss_begin and iter_ss_end. */ asm volatile("iter_ss_begin:nop\n"); @@ -268,11 +267,9 @@ static void guest_code_ss(int test_cnt) bvr = read_sysreg(dbgbvr0_el1); wvr = read_sysreg(dbgwvr0_el1); + /* Userspace disables Single Step when the end is nigh. */ asm volatile("iter_ss_end:\n"); - /* Disable Single Step execution */ - GUEST_SYNC(SINGLE_STEP_DISABLE); - GUEST_ASSERT(bvr == w_bvr); GUEST_ASSERT(wvr == w_wvr); } @@ -364,15 +361,12 @@ void test_single_step_from_userspace(int test_cnt) TEST_ASSERT(cmd == UCALL_SYNC, "Unexpected ucall cmd 0x%lx", cmd); - if (uc.args[1] == SINGLE_STEP_ENABLE) { - debug.control = KVM_GUESTDBG_ENABLE | - KVM_GUESTDBG_SINGLESTEP; - ss_enable = true; - } else { - debug.control = KVM_GUESTDBG_ENABLE; - ss_enable = false; - } + TEST_ASSERT(uc.args[1] == SINGLE_STEP_ENABLE, + "Unexpected ucall action 0x%lx", uc.args[1]); + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; vcpu_guest_debug_set(vcpu, &debug); continue; } @@ -385,6 +379,14 @@ void test_single_step_from_userspace(int test_cnt) "Unexpected pc 0x%lx (expected 0x%lx)", pc, test_pc); + if ((pc + 4) == (uint64_t)&iter_ss_end) { + test_pc = 0; + debug.control = KVM_GUESTDBG_ENABLE; + ss_enable = false; + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + /* * If the current pc is between iter_ss_bgin and * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should -- GitLab From 7046638192d52416adbfc273c36950f0e3311191 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:34:03 +0000 Subject: [PATCH 0679/1423] KVM: selftests: Consolidate common code for populating ucall struct Make ucall() a common helper that populates struct ucall, and only calls into arch code to make the actually call out to userspace. Rename all arch-specific helpers to make it clear they're arch-specific, and to avoid collisions with common helpers (one more on its way...) Add WRITE_ONCE() to stores in ucall() code (as already done to aarch64 code in commit 9e2f6498efbb ("selftests: KVM: Handle compiler optimizations in ucall")) to prevent clang optimizations breaking ucalls. Cc: Colton Lewis Reviewed-by: Andrew Jones Tested-by: Peter Gonda Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-2-seanjc@google.com --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/ucall_common.h | 23 ++++++++++++++++--- .../testing/selftests/kvm/lib/aarch64/ucall.c | 22 ++++-------------- tools/testing/selftests/kvm/lib/riscv/ucall.c | 23 ++++--------------- tools/testing/selftests/kvm/lib/s390x/ucall.c | 23 ++++--------------- .../testing/selftests/kvm/lib/ucall_common.c | 20 ++++++++++++++++ .../testing/selftests/kvm/lib/x86_64/ucall.c | 23 ++++--------------- 7 files changed, 61 insertions(+), 74 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/ucall_common.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a00253b79040b..6e2a683629c77 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -47,6 +47,7 @@ LIBKVM += lib/memstress.c LIBKVM += lib/rbtree.c LIBKVM += lib/sparsebit.c LIBKVM += lib/test_util.c +LIBKVM += lib/ucall_common.c LIBKVM_STRING += lib/string_override.c diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index ee79d180e07e4..5a85f5318bbe5 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -24,10 +24,27 @@ struct ucall { uint64_t args[UCALL_MAX_ARGS]; }; -void ucall_init(struct kvm_vm *vm, void *arg); -void ucall_uninit(struct kvm_vm *vm); +void ucall_arch_init(struct kvm_vm *vm, void *arg); +void ucall_arch_uninit(struct kvm_vm *vm); +void ucall_arch_do_ucall(vm_vaddr_t uc); +uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); + void ucall(uint64_t cmd, int nargs, ...); -uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); + +static inline void ucall_init(struct kvm_vm *vm, void *arg) +{ + ucall_arch_init(vm, arg); +} + +static inline void ucall_uninit(struct kvm_vm *vm) +{ + ucall_arch_uninit(vm); +} + +static inline uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + return ucall_arch_get_ucall(vcpu, uc); +} #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index ed237b7446907..3630708c32d62 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -21,7 +21,7 @@ static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) return true; } -void ucall_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, void *arg) { vm_paddr_t gpa, start, end, step, offset; unsigned int bits; @@ -64,30 +64,18 @@ void ucall_init(struct kvm_vm *vm, void *arg) TEST_FAIL("Can't find a ucall mmio address"); } -void ucall_uninit(struct kvm_vm *vm) +void ucall_arch_uninit(struct kvm_vm *vm) { ucall_exit_mmio_addr = 0; sync_global_to_guest(vm, ucall_exit_mmio_addr); } -void ucall(uint64_t cmd, int nargs, ...) +void ucall_arch_do_ucall(vm_vaddr_t uc) { - struct ucall uc = {}; - va_list va; - int i; - - WRITE_ONCE(uc.cmd, cmd); - nargs = min(nargs, UCALL_MAX_ARGS); - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); - va_end(va); - - WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc); + WRITE_ONCE(*ucall_exit_mmio_addr, uc); } -uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { struct kvm_run *run = vcpu->run; struct ucall ucall = {}; diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 087b9740bc8fc..b1598f418c1f8 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -10,11 +10,11 @@ #include "kvm_util.h" #include "processor.h" -void ucall_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, void *arg) { } -void ucall_uninit(struct kvm_vm *vm) +void ucall_arch_uninit(struct kvm_vm *vm) { } @@ -44,27 +44,14 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, return ret; } -void ucall(uint64_t cmd, int nargs, ...) +void ucall_arch_do_ucall(vm_vaddr_t uc) { - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = min(nargs, UCALL_MAX_ARGS); - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, KVM_RISCV_SELFTESTS_SBI_UCALL, - (vm_vaddr_t)&uc, 0, 0, 0, 0, 0); + uc, 0, 0, 0, 0, 0); } -uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { struct kvm_run *run = vcpu->run; struct ucall ucall = {}; diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 73dc4e21190ff..114cb4af295f5 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -6,34 +6,21 @@ */ #include "kvm_util.h" -void ucall_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, void *arg) { } -void ucall_uninit(struct kvm_vm *vm) +void ucall_arch_uninit(struct kvm_vm *vm) { } -void ucall(uint64_t cmd, int nargs, ...) +void ucall_arch_do_ucall(vm_vaddr_t uc) { - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = min(nargs, UCALL_MAX_ARGS); - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ - asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); + asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory"); } -uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { struct kvm_run *run = vcpu->run; struct ucall ucall = {}; diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c new file mode 100644 index 0000000000000..2395c7f1d5435 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "kvm_util.h" + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall uc = {}; + va_list va; + int i; + + WRITE_ONCE(uc.cmd, cmd); + + nargs = min(nargs, UCALL_MAX_ARGS); + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); + va_end(va); + + ucall_arch_do_ucall((vm_vaddr_t)&uc); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index e5f0f9e0d3ee0..9f532dba10035 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -8,34 +8,21 @@ #define UCALL_PIO_PORT ((uint16_t)0x1000) -void ucall_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, void *arg) { } -void ucall_uninit(struct kvm_vm *vm) +void ucall_arch_uninit(struct kvm_vm *vm) { } -void ucall(uint64_t cmd, int nargs, ...) +void ucall_arch_do_ucall(vm_vaddr_t uc) { - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = min(nargs, UCALL_MAX_ARGS); - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - asm volatile("in %[port], %%al" - : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); + : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory"); } -uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { struct kvm_run *run = vcpu->run; struct ucall ucall = {}; -- GitLab From ef38871eb22879438d2af8642ed7a52c1616f410 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:34:04 +0000 Subject: [PATCH 0680/1423] KVM: selftests: Consolidate boilerplate code in get_ucall() Consolidate the actual copying of a ucall struct from guest=>host into the common get_ucall(). Return a host virtual address instead of a guest virtual address even though the addr_gva2hva() part could be moved to get_ucall() too. Conceptually, get_ucall() is invoked from the host and should return a host virtual address (and returning NULL for "nothing to see here" is far superior to returning 0). Use pointer shenanigans instead of an unnecessary bounce buffer when the caller of get_ucall() provides a valid pointer. Reviewed-by: Andrew Jones Tested-by: Peter Gonda Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-3-seanjc@google.com --- .../selftests/kvm/include/ucall_common.h | 8 ++------ .../testing/selftests/kvm/lib/aarch64/ucall.c | 14 +++----------- tools/testing/selftests/kvm/lib/riscv/ucall.c | 19 +++---------------- tools/testing/selftests/kvm/lib/s390x/ucall.c | 16 +++------------- .../testing/selftests/kvm/lib/ucall_common.c | 19 +++++++++++++++++++ .../testing/selftests/kvm/lib/x86_64/ucall.c | 16 +++------------- 6 files changed, 33 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 5a85f5318bbe5..63bfc60be995e 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -27,9 +27,10 @@ struct ucall { void ucall_arch_init(struct kvm_vm *vm, void *arg); void ucall_arch_uninit(struct kvm_vm *vm); void ucall_arch_do_ucall(vm_vaddr_t uc); -uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); void ucall(uint64_t cmd, int nargs, ...); +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); static inline void ucall_init(struct kvm_vm *vm, void *arg) { @@ -41,11 +42,6 @@ static inline void ucall_uninit(struct kvm_vm *vm) ucall_arch_uninit(vm); } -static inline uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) -{ - return ucall_arch_get_ucall(vcpu, uc); -} - #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 3630708c32d62..f214f5cc53d3b 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -75,13 +75,9 @@ void ucall_arch_do_ucall(vm_vaddr_t uc) WRITE_ONCE(*ucall_exit_mmio_addr, uc); } -uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - struct ucall ucall = {}; - - if (uc) - memset(uc, 0, sizeof(*uc)); if (run->exit_reason == KVM_EXIT_MMIO && run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { @@ -90,12 +86,8 @@ uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, "Unexpected ucall exit mmio address access"); memcpy(&gva, run->mmio.data, sizeof(gva)); - memcpy(&ucall, addr_gva2hva(vcpu->vm, gva), sizeof(ucall)); - - vcpu_run_complete_io(vcpu); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + return addr_gva2hva(vcpu->vm, gva); } - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index b1598f418c1f8..37e091d4366e5 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -51,27 +51,15 @@ void ucall_arch_do_ucall(vm_vaddr_t uc) uc, 0, 0, 0, 0, 0); } -uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - struct ucall ucall = {}; - - if (uc) - memset(uc, 0, sizeof(*uc)); if (run->exit_reason == KVM_EXIT_RISCV_SBI && run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) { switch (run->riscv_sbi.function_id) { case KVM_RISCV_SELFTESTS_SBI_UCALL: - memcpy(&ucall, - addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]), - sizeof(ucall)); - - vcpu_run_complete_io(vcpu); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); - - break; + return addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]); case KVM_RISCV_SELFTESTS_SBI_UNEXP: vcpu_dump(stderr, vcpu, 2); TEST_ASSERT(0, "Unexpected trap taken by guest"); @@ -80,6 +68,5 @@ uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) break; } } - - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 114cb4af295f5..0f695a031d35d 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -20,13 +20,9 @@ void ucall_arch_do_ucall(vm_vaddr_t uc) asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory"); } -uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - struct ucall ucall = {}; - - if (uc) - memset(uc, 0, sizeof(*uc)); if (run->exit_reason == KVM_EXIT_S390_SIEIC && run->s390_sieic.icptcode == 4 && @@ -34,13 +30,7 @@ uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) (run->s390_sieic.ipb >> 16) == 0x501) { int reg = run->s390_sieic.ipa & 0xf; - memcpy(&ucall, addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]), - sizeof(ucall)); - - vcpu_run_complete_io(vcpu); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + return addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]); } - - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index 2395c7f1d5435..ced4808607463 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -18,3 +18,22 @@ void ucall(uint64_t cmd, int nargs, ...) ucall_arch_do_ucall((vm_vaddr_t)&uc); } + +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + struct ucall ucall; + void *addr; + + if (!uc) + uc = &ucall; + + addr = ucall_arch_get_ucall(vcpu); + if (addr) { + memcpy(uc, addr, sizeof(*uc)); + vcpu_run_complete_io(vcpu); + } else { + memset(uc, 0, sizeof(*uc)); + } + + return uc->cmd; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index 9f532dba10035..ead9946399abf 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -22,25 +22,15 @@ void ucall_arch_do_ucall(vm_vaddr_t uc) : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory"); } -uint64_t ucall_arch_get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - struct ucall ucall = {}; - - if (uc) - memset(uc, 0, sizeof(*uc)); if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; vcpu_regs_get(vcpu, ®s); - memcpy(&ucall, addr_gva2hva(vcpu->vm, (vm_vaddr_t)regs.rdi), - sizeof(ucall)); - - vcpu_run_complete_io(vcpu); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + return addr_gva2hva(vcpu->vm, regs.rdi); } - - return ucall.cmd; + return NULL; } -- GitLab From dc88244bf5488b04fb7bbe47d8d9c38ff8f7dbb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:34:05 +0000 Subject: [PATCH 0681/1423] KVM: selftests: Automatically do init_ucall() for non-barebones VMs Do init_ucall() automatically during VM creation to kill two (three?) birds with one stone. First, initializing ucall immediately after VM creations allows forcing aarch64's MMIO ucall address to immediately follow memslot0. This is still somewhat fragile as tests could clobber the MMIO address with a new memslot, but it's safe-ish since tests have to be conversative when accounting for memslot0. And this can be hardened in the future by creating a read-only memslot for the MMIO page (KVM ARM exits with MMIO if the guest writes to a read-only memslot). Add a TODO to document that selftests can and should use a memslot for the ucall MMIO (doing so requires yet more rework because tests assumes thay can use all memslots except memslot0). Second, initializing ucall for all VMs prepares for making ucall initialization meaningful on all architectures. aarch64 is currently the only arch that needs to do any setup, but that will change in the future by switching to a pool-based implementation (instead of the current stack-based approach). Lastly, defining the ucall MMIO address from common code will simplify switching all architectures (except s390) to a common MMIO-based ucall implementation (if there's ever sufficient motivation to do so). Cc: Oliver Upton Reviewed-by: Andrew Jones Tested-by: Peter Gonda Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-4-seanjc@google.com --- .../selftests/kvm/aarch64/aarch32_id_regs.c | 2 - .../selftests/kvm/aarch64/arch_timer.c | 1 - .../selftests/kvm/aarch64/debug-exceptions.c | 2 - .../selftests/kvm/aarch64/hypercalls.c | 1 - .../testing/selftests/kvm/aarch64/psci_test.c | 1 - .../testing/selftests/kvm/aarch64/vgic_init.c | 2 - .../testing/selftests/kvm/aarch64/vgic_irq.c | 1 - tools/testing/selftests/kvm/dirty_log_test.c | 2 - .../selftests/kvm/include/ucall_common.h | 6 +-- .../selftests/kvm/kvm_page_table_test.c | 1 - .../testing/selftests/kvm/lib/aarch64/ucall.c | 54 ++----------------- tools/testing/selftests/kvm/lib/kvm_util.c | 11 ++++ tools/testing/selftests/kvm/lib/memstress.c | 2 - tools/testing/selftests/kvm/lib/riscv/ucall.c | 2 +- tools/testing/selftests/kvm/lib/s390x/ucall.c | 2 +- .../testing/selftests/kvm/lib/x86_64/ucall.c | 2 +- .../testing/selftests/kvm/memslot_perf_test.c | 1 - tools/testing/selftests/kvm/rseq_test.c | 1 - tools/testing/selftests/kvm/steal_time.c | 1 - .../kvm/system_counter_offset_test.c | 1 - 20 files changed, 20 insertions(+), 76 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c index 6f9c1f19c7f64..03f6b3af6b4d7 100644 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -158,8 +158,6 @@ int main(void) TEST_REQUIRE(vcpu_aarch64_only(vcpu)); - ucall_init(vm, NULL); - test_user_raz_wi(vcpu); test_user_raz_invariant(vcpu); test_guest_raz(vcpu); diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 9409617fce9c4..54016cdd8a098 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -375,7 +375,6 @@ static struct kvm_vm *test_vm_create(void) for (i = 0; i < nr_vcpus; i++) vcpu_init_descriptor_tables(vcpus[i]); - ucall_init(vm, NULL); test_init_timer_irq(vm); gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index a3c2216b65fb3..d86c4e4d1c826 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -292,7 +292,6 @@ static void test_guest_debug_exceptions(void) int stage; vm = vm_create_with_one_vcpu(&vcpu, guest_code); - ucall_init(vm, NULL); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); @@ -343,7 +342,6 @@ void test_single_step_from_userspace(int test_cnt) struct kvm_guest_debug debug = {}; vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); - ucall_init(vm, NULL); run = vcpu->run; vcpu_args_set(vcpu, 1, test_cnt); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index a39da3fe49525..3dceecfd1f628 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -236,7 +236,6 @@ static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu) vm = vm_create_with_one_vcpu(vcpu, guest_code); - ucall_init(vm, NULL); steal_time_init(*vcpu); return vm; diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index e0b9e81a3e091..cfa36f3879484 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -79,7 +79,6 @@ static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, struct kvm_vm *vm; vm = vm_create(2); - ucall_init(vm, NULL); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 9c131d977a1b5..eef816b80993f 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -68,8 +68,6 @@ static void guest_code(void) /* we don't want to assert on run execution, hence that helper */ static int run_vcpu(struct kvm_vcpu *vcpu) { - ucall_init(vcpu->vm, NULL); - return __vcpu_run(vcpu) ? -errno : 0; } diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 4ead42a072b7d..e0310ebc313ca 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -756,7 +756,6 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) print_args(&args); vm = vm_create_with_one_vcpu(&vcpu, guest_code); - ucall_init(vm, NULL); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index b5234d6efbe15..b458a2701634b 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -756,8 +756,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); - ucall_init(vm, NULL); - /* Export the shared variables to the guest */ sync_global_to_guest(vm, host_page_size); sync_global_to_guest(vm, guest_page_size); diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 63bfc60be995e..8077a6d8b1ba5 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -24,7 +24,7 @@ struct ucall { uint64_t args[UCALL_MAX_ARGS]; }; -void ucall_arch_init(struct kvm_vm *vm, void *arg); +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); void ucall_arch_uninit(struct kvm_vm *vm); void ucall_arch_do_ucall(vm_vaddr_t uc); void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); @@ -32,9 +32,9 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); void ucall(uint64_t cmd, int nargs, ...); uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); -static inline void ucall_init(struct kvm_vm *vm, void *arg) +static inline void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { - ucall_arch_init(vm, arg); + ucall_arch_init(vm, mmio_gpa); } static inline void ucall_uninit(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 696b366be06b4..3db32d56787e8 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -289,7 +289,6 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); /* Export shared structure test_args to guest */ - ucall_init(vm, NULL); sync_global_to_guest(vm, test_args); ret = sem_init(&test_stage_updated, 0, 0); diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index f214f5cc53d3b..f02ae27c3e43a 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -8,60 +8,12 @@ static vm_vaddr_t *ucall_exit_mmio_addr; -static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { - if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) - return false; + virt_pg_map(vm, mmio_gpa, mmio_gpa); - virt_pg_map(vm, gpa, gpa); - - ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; + ucall_exit_mmio_addr = (vm_vaddr_t *)mmio_gpa; sync_global_to_guest(vm, ucall_exit_mmio_addr); - - return true; -} - -void ucall_arch_init(struct kvm_vm *vm, void *arg) -{ - vm_paddr_t gpa, start, end, step, offset; - unsigned int bits; - bool ret; - - if (arg) { - gpa = (vm_paddr_t)arg; - ret = ucall_mmio_init(vm, gpa); - TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); - return; - } - - /* - * Find an address within the allowed physical and virtual address - * spaces, that does _not_ have a KVM memory region associated with - * it. Identity mapping an address like this allows the guest to - * access it, but as KVM doesn't know what to do with it, it - * will assume it's something userspace handles and exit with - * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. - * Here we start with a guess that the addresses around 5/8th - * of the allowed space are unmapped and then work both down and - * up from there in 1/16th allowed space sized steps. - * - * Note, we need to use VA-bits - 1 when calculating the allowed - * virtual address space for an identity mapping because the upper - * half of the virtual address space is the two's complement of the - * lower and won't match physical addresses. - */ - bits = vm->va_bits - 1; - bits = min(vm->pa_bits, bits); - end = 1ul << bits; - start = end * 5 / 8; - step = end / 16; - for (offset = 0; offset < end - start; offset += step) { - if (ucall_mmio_init(vm, start - offset)) - return; - if (ucall_mmio_init(vm, start + offset)) - return; - } - TEST_FAIL("Can't find a ucall mmio address"); } void ucall_arch_uninit(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3b7710fb37840..07c8edd4e5486 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -335,15 +335,26 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, { uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, nr_extra_pages); + struct userspace_mem_region *slot0; struct kvm_vm *vm; vm = ____vm_create(mode, nr_pages); kvm_vm_elf_load(vm, program_invocation_name); + /* + * TODO: Add proper defines to protect the library's memslots, and then + * carve out memslot1 for the ucall MMIO address. KVM treats writes to + * read-only memslots as MMIO, and creating a read-only memslot for the + * MMIO region would prevent silently clobbering the MMIO region. + */ + slot0 = memslot2region(vm, 0); + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + #ifdef __x86_64__ vm_create_irqchip(vm); #endif + return vm; } diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 503da78c558dd..b66404e56a3f8 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -221,8 +221,6 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, memstress_setup_nested(vm, nr_vcpus, vcpus); } - ucall_init(vm, NULL); - /* Export the shared variables to the guest. */ sync_global_to_guest(vm, memstress_args); diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 37e091d4366e5..c58ecb8a0981b 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -10,7 +10,7 @@ #include "kvm_util.h" #include "processor.h" -void ucall_arch_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 0f695a031d35d..208f0f04299bb 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -6,7 +6,7 @@ */ #include "kvm_util.h" -void ucall_arch_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index ead9946399abf..016a0487cf723 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -8,7 +8,7 @@ #define UCALL_PIO_PORT ((uint16_t)0x1000) -void ucall_arch_init(struct kvm_vm *vm, void *arg) +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 330aaef1c02fb..d771262ea584b 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -277,7 +277,6 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, TEST_ASSERT(data->hva_slots, "malloc() fail"); data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); - ucall_init(data->vm, NULL); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", max_mem_slots - 1, data->pages_per_slot, rempages); diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 6f88da7e60be6..0e9e2b48a51f7 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -224,7 +224,6 @@ int main(int argc, char *argv[]) * CPU affinity. */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); - ucall_init(vm, NULL); pthread_create(&migration_thread, NULL, migration_worker, (void *)(unsigned long)syscall(SYS_gettid)); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index db8967f1a17bc..c87f387120736 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -266,7 +266,6 @@ int main(int ac, char **av) gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); - ucall_init(vm, NULL); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 1c274933912bb..7f5b330b6a1b1 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -121,7 +121,6 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); check_preconditions(vcpu); - ucall_init(vm, NULL); enter_guest(vcpu); kvm_vm_free(vm); -- GitLab From cf4694be2b2cf74945e50d39a02ea2307c4495f4 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Thu, 6 Oct 2022 00:34:06 +0000 Subject: [PATCH 0682/1423] tools: Add atomic_test_and_set_bit() Add x86 and generic implementations of atomic_test_and_set_bit() to allow KVM selftests to atomically manage bitmaps. Note, the generic version is taken from arch_test_and_set_bit() as of commit 415d83249709 ("locking/atomic: Make test_and_*_bit() ordered on failure"). Signed-off-by: Peter Gonda Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-5-seanjc@google.com --- tools/arch/x86/include/asm/atomic.h | 7 +++++++ tools/include/asm-generic/atomic-gcc.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index 1f5e26aae9fc5..01cc27ec4520d 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -8,6 +8,7 @@ #define LOCK_PREFIX "\n\tlock; " +#include #include /* @@ -70,4 +71,10 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } +static inline int atomic_test_and_set_bit(long nr, unsigned long *addr) +{ + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, "Ir", nr, "%0", "c"); + +} + #endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */ diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 4c1966f7c77a7..6daa68bf5b9e0 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -4,6 +4,7 @@ #include #include +#include /* * Atomic operations that C can't guarantee us. Useful for @@ -69,4 +70,15 @@ static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval) return cmpxchg(&(v)->counter, oldval, newval); } +static inline int atomic_test_and_set_bit(long nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + long old; + + addr += BIT_WORD(nr); + + old = __sync_fetch_and_or(addr, mask); + return !!(old & mask); +} + #endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */ -- GitLab From 03b4750533fc6519845ac2ca0e1d88a81ac260a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:34:07 +0000 Subject: [PATCH 0683/1423] KVM: selftests: Make arm64's MMIO ucall multi-VM friendly Fix a mostly-theoretical bug where ARM's ucall MMIO setup could result in different VMs stomping on each other by cloberring the global pointer. Fix the most obvious issue by saving the MMIO gpa into the VM. A more subtle bug is that creating VMs in parallel (on multiple tasks) could result in a VM using the wrong address. Synchronizing a global to a guest effectively snapshots the value on a per-VM basis, i.e. the "global" is already prepped to work with multiple VMs, but setting the global in the host is not thread-safe. To fix that bug, add write_guest_global() to allow stuffing a VM's copy of a "global" without modifying the host value. Reviewed-by: Andrew Jones Tested-by: Peter Gonda Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-6-seanjc@google.com --- .../selftests/kvm/include/kvm_util_base.h | 15 +++++++++++++++ .../testing/selftests/kvm/lib/aarch64/ucall.c | 19 ++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 3bf2333ef95d2..a7047e0767d33 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -16,6 +16,7 @@ #include #include "linux/rbtree.h" +#include #include @@ -81,6 +82,7 @@ struct kvm_vm { struct sparsebit *vpages_mapped; bool has_irqchip; bool pgd_created; + vm_paddr_t ucall_mmio_addr; vm_paddr_t pgd; vm_vaddr_t gdt; vm_vaddr_t tss; @@ -722,6 +724,19 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, memcpy(&(g), _p, sizeof(g)); \ }) +/* + * Write a global value, but only in the VM's (guest's) domain. Primarily used + * for "globals" that hold per-VM values (VMs always duplicate code and global + * data into their own region of physical memory), but can be used anytime it's + * undesirable to change the host's copy of the global. + */ +#define write_guest_global(vm, g, val) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + typeof(g) _val = val; \ + \ + memcpy(_p, &(_val), sizeof(g)); \ +}) + void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index f02ae27c3e43a..1c38bd260f90b 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -6,20 +6,29 @@ */ #include "kvm_util.h" +/* + * ucall_exit_mmio_addr holds per-VM values (global data is duplicated by each + * VM), it must not be accessed from host code. + */ static vm_vaddr_t *ucall_exit_mmio_addr; +static void ucall_set_mmio_addr(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ + vm->ucall_mmio_addr = mmio_gpa; + + write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gpa); +} + void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { virt_pg_map(vm, mmio_gpa, mmio_gpa); - ucall_exit_mmio_addr = (vm_vaddr_t *)mmio_gpa; - sync_global_to_guest(vm, ucall_exit_mmio_addr); + ucall_set_mmio_addr(vm, mmio_gpa); } void ucall_arch_uninit(struct kvm_vm *vm) { - ucall_exit_mmio_addr = 0; - sync_global_to_guest(vm, ucall_exit_mmio_addr); + ucall_set_mmio_addr(vm, (vm_paddr_t)NULL); } void ucall_arch_do_ucall(vm_vaddr_t uc) @@ -32,7 +41,7 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; if (run->exit_reason == KVM_EXIT_MMIO && - run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { + run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) { vm_vaddr_t gva; TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, -- GitLab From 28a65567acb51759079adf5c6e3fcd047cda8120 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:34:08 +0000 Subject: [PATCH 0684/1423] KVM: selftests: Drop now-unnecessary ucall_uninit() Drop ucall_uninit() and ucall_arch_uninit() now that ARM doesn't modify the host's copy of ucall_exit_mmio_addr, i.e. now that there's no need to reset the pointer before potentially creating a new VM. The few calls to ucall_uninit() are all immediately followed by kvm_vm_free(), and that is likely always going to hold true, i.e. it's extremely unlikely a test will want to effectively disable ucall in the middle of a test. Reviewed-by: Andrew Jones Tested-by: Peter Gonda Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-7-seanjc@google.com --- .../selftests/kvm/aarch64/aarch32_id_regs.c | 1 - tools/testing/selftests/kvm/dirty_log_test.c | 1 - tools/testing/selftests/kvm/include/ucall_common.h | 6 ------ tools/testing/selftests/kvm/kvm_page_table_test.c | 1 - tools/testing/selftests/kvm/lib/aarch64/ucall.c | 14 ++------------ tools/testing/selftests/kvm/lib/memstress.c | 1 - tools/testing/selftests/kvm/lib/riscv/ucall.c | 4 ---- tools/testing/selftests/kvm/lib/s390x/ucall.c | 4 ---- tools/testing/selftests/kvm/lib/x86_64/ucall.c | 4 ---- 9 files changed, 2 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c index 03f6b3af6b4d7..b1d2158c0b6de 100644 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -162,6 +162,5 @@ int main(void) test_user_raz_invariant(vcpu); test_guest_raz(vcpu); - ucall_uninit(vm); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index b458a2701634b..a38c4369fb8ed 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -811,7 +811,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) free(bmap); free(host_bmap_track); - ucall_uninit(vm); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 8077a6d8b1ba5..2662a4352a8c0 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -25,7 +25,6 @@ struct ucall { }; void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); -void ucall_arch_uninit(struct kvm_vm *vm); void ucall_arch_do_ucall(vm_vaddr_t uc); void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); @@ -37,11 +36,6 @@ static inline void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) ucall_arch_init(vm, mmio_gpa); } -static inline void ucall_uninit(struct kvm_vm *vm) -{ - ucall_arch_uninit(vm); -} - #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 3db32d56787e8..b3b00be1ef824 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -416,7 +416,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) TEST_ASSERT(ret == 0, "Error in sem_destroy"); free(vcpu_threads); - ucall_uninit(vm); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 1c38bd260f90b..21d73afcb14fb 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -12,23 +12,13 @@ */ static vm_vaddr_t *ucall_exit_mmio_addr; -static void ucall_set_mmio_addr(struct kvm_vm *vm, vm_paddr_t mmio_gpa) -{ - vm->ucall_mmio_addr = mmio_gpa; - - write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gpa); -} - void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { virt_pg_map(vm, mmio_gpa, mmio_gpa); - ucall_set_mmio_addr(vm, mmio_gpa); -} + vm->ucall_mmio_addr = mmio_gpa; -void ucall_arch_uninit(struct kvm_vm *vm) -{ - ucall_set_mmio_addr(vm, (vm_paddr_t)NULL); + write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gpa); } void ucall_arch_do_ucall(vm_vaddr_t uc) diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index b66404e56a3f8..2de8a5d527b3a 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -229,7 +229,6 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, void memstress_destroy_vm(struct kvm_vm *vm) { - ucall_uninit(vm); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index c58ecb8a0981b..78acdb084ab0f 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -14,10 +14,6 @@ void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } -void ucall_arch_uninit(struct kvm_vm *vm) -{ -} - struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 208f0f04299bb..cbee520a26f2a 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -10,10 +10,6 @@ void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } -void ucall_arch_uninit(struct kvm_vm *vm) -{ -} - void ucall_arch_do_ucall(vm_vaddr_t uc) { /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index 016a0487cf723..eb8bf55b359a5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -12,10 +12,6 @@ void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { } -void ucall_arch_uninit(struct kvm_vm *vm) -{ -} - void ucall_arch_do_ucall(vm_vaddr_t uc) { asm volatile("in %[port], %%al" -- GitLab From 426729b2cf2e02ff4bd5c988832c044c8b77f4c7 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Thu, 6 Oct 2022 00:34:09 +0000 Subject: [PATCH 0685/1423] KVM: selftests: Add ucall pool based implementation To play nice with guests whose stack memory is encrypted, e.g. AMD SEV, introduce a new "ucall pool" implementation that passes the ucall struct via dedicated memory (which can be mapped shared, a.k.a. as plain text). Because not all architectures have access to the vCPU index in the guest, use a bitmap with atomic accesses to track which entries in the pool are free/used. A list+lock could also work in theory, but synchronizing the individual pointers to the guest would be a mess. Note, there's no need to rewalk the bitmap to ensure success. If all vCPUs are simply allocating, success is guaranteed because there are enough entries for all vCPUs. If one or more vCPUs are freeing and then reallocating, success is guaranteed because vCPUs _always_ walk the bitmap from 0=>N; if vCPU frees an entry and then wins a race to re-allocate, then either it will consume the entry it just freed (bit is the first free bit), or the losing vCPU is guaranteed to see the freed bit (winner consumes an earlier bit, which the loser hasn't yet visited). Reviewed-by: Andrew Jones Signed-off-by: Peter Gonda Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006003409.649993-8-seanjc@google.com --- .../selftests/kvm/include/ucall_common.h | 9 ++- .../testing/selftests/kvm/lib/aarch64/ucall.c | 7 +- tools/testing/selftests/kvm/lib/riscv/ucall.c | 2 +- tools/testing/selftests/kvm/lib/s390x/ucall.c | 2 +- .../testing/selftests/kvm/lib/ucall_common.c | 72 +++++++++++++++++-- .../testing/selftests/kvm/lib/x86_64/ucall.c | 2 +- 6 files changed, 77 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 2662a4352a8c0..bdd373189a777 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -22,6 +22,9 @@ enum { struct ucall { uint64_t cmd; uint64_t args[UCALL_MAX_ARGS]; + + /* Host virtual address of this struct. */ + struct ucall *hva; }; void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); @@ -30,11 +33,7 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); void ucall(uint64_t cmd, int nargs, ...); uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); - -static inline void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) -{ - ucall_arch_init(vm, mmio_gpa); -} +void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 21d73afcb14fb..562c16dfbb002 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -32,12 +32,9 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_MMIO && run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) { - vm_vaddr_t gva; - - TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, + TEST_ASSERT(run->mmio.is_write && run->mmio.len == sizeof(uint64_t), "Unexpected ucall exit mmio address access"); - memcpy(&gva, run->mmio.data, sizeof(gva)); - return addr_gva2hva(vcpu->vm, gva); + return (void *)(*((uint64_t *)run->mmio.data)); } return NULL; diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 78acdb084ab0f..9a3476a2dfca4 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -55,7 +55,7 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) { switch (run->riscv_sbi.function_id) { case KVM_RISCV_SELFTESTS_SBI_UCALL: - return addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]); + return (void *)run->riscv_sbi.args[0]; case KVM_RISCV_SELFTESTS_SBI_UNEXP: vcpu_dump(stderr, vcpu, 2); TEST_ASSERT(0, "Unexpected trap taken by guest"); diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index cbee520a26f2a..a7f02dc372cf7 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -26,7 +26,7 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) (run->s390_sieic.ipb >> 16) == 0x501) { int reg = run->s390_sieic.ipa & 0xf; - return addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]); + return (void *)run->s.regs.gprs[reg]; } return NULL; } diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index ced4808607463..fcae96461e460 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -1,22 +1,86 @@ // SPDX-License-Identifier: GPL-2.0-only #include "kvm_util.h" +#include "linux/types.h" +#include "linux/bitmap.h" +#include "linux/atomic.h" + +struct ucall_header { + DECLARE_BITMAP(in_use, KVM_MAX_VCPUS); + struct ucall ucalls[KVM_MAX_VCPUS]; +}; + +/* + * ucall_pool holds per-VM values (global data is duplicated by each VM), it + * must not be accessed from host code. + */ +static struct ucall_header *ucall_pool; + +void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ + struct ucall_header *hdr; + struct ucall *uc; + vm_vaddr_t vaddr; + int i; + + vaddr = vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR); + hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr); + memset(hdr, 0, sizeof(*hdr)); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + uc = &hdr->ucalls[i]; + uc->hva = uc; + } + + write_guest_global(vm, ucall_pool, (struct ucall_header *)vaddr); + + ucall_arch_init(vm, mmio_gpa); +} + +static struct ucall *ucall_alloc(void) +{ + struct ucall *uc; + int i; + + GUEST_ASSERT(ucall_pool); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (!atomic_test_and_set_bit(i, ucall_pool->in_use)) { + uc = &ucall_pool->ucalls[i]; + memset(uc->args, 0, sizeof(uc->args)); + return uc; + } + } + + GUEST_ASSERT(0); + return NULL; +} + +static void ucall_free(struct ucall *uc) +{ + /* Beware, here be pointer arithmetic. */ + clear_bit(uc - ucall_pool->ucalls, ucall_pool->in_use); +} void ucall(uint64_t cmd, int nargs, ...) { - struct ucall uc = {}; + struct ucall *uc; va_list va; int i; - WRITE_ONCE(uc.cmd, cmd); + uc = ucall_alloc(); + + WRITE_ONCE(uc->cmd, cmd); nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) - WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); + WRITE_ONCE(uc->args[i], va_arg(va, uint64_t)); va_end(va); - ucall_arch_do_ucall((vm_vaddr_t)&uc); + ucall_arch_do_ucall((vm_vaddr_t)uc->hva); + + ucall_free(uc); } uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index eb8bf55b359a5..4d41dc63cc9ed 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -26,7 +26,7 @@ void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) struct kvm_regs regs; vcpu_regs_get(vcpu, ®s); - return addr_gva2hva(vcpu->vm, regs.rdi); + return (void *)regs.rdi; } return NULL; } -- GitLab From 9a6418dacd241169df9e0eeefc7980b3f9b40794 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 22:34:58 +0100 Subject: [PATCH 0686/1423] KVM: selftests: Fix spelling mistake "begining" -> "beginning" There is a spelling mistake in an assert message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220928213458.64089-1-colin.i.king@gmail.com [sean: fix an ironic typo in the changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index 9f54c098d9d08..d71a9a5974de5 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -138,7 +138,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) offset = hdr.e_phoff + (n1 * hdr.e_phentsize); offset_rv = lseek(fd, offset, SEEK_SET); TEST_ASSERT(offset_rv == offset, - "Failed to seek to begining of program header %u,\n" + "Failed to seek to beginning of program header %u,\n" " filename: %s\n" " rv: %jd errno: %i", n1, filename, (intmax_t) offset_rv, errno); -- GitLab From 816c54b74742ac1a74a74de9355ab982d11e63e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:06 +0000 Subject: [PATCH 0687/1423] KVM: selftests: Drop helpers to read/write page table entries Drop vm_{g,s}et_page_table_entry() and instead expose the "inner" helper (was _vm_get_page_table_entry()) that returns a _pointer_ to the PTE, i.e. let tests directly modify PTEs instead of bouncing through helpers that just make life difficult. Opportunsitically use BIT_ULL() in emulator_error_test, and use the MAXPHYADDR define to set the "rogue" GPA bit instead of open coding the same value. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-2-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 6 ++---- .../selftests/kvm/lib/x86_64/processor.c | 21 ++----------------- .../kvm/x86_64/emulator_error_test.c | 6 ++++-- 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e8ca0d8a6a7e0..30d5df1ebaad0 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -827,10 +827,8 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) bool kvm_is_tdp_enabled(void); -uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr); -void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr, uint64_t pte); +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 39c4409ef56a6..90b35998b0f3a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -241,9 +241,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, - struct kvm_vcpu *vcpu, - uint64_t vaddr) +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr) { uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; @@ -313,22 +312,6 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, return &pte[index[0]]; } -uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr) -{ - uint64_t *pte = _vm_get_page_table_entry(vm, vcpu, vaddr); - - return *(uint64_t *)pte; -} - -void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr, uint64_t pte) -{ - uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpu, vaddr); - - *(uint64_t *)new_pte = pte; -} - void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { uint64_t *pml4e, *pml4e_start; diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 236e11755ba67..bde247f3c8a16 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -152,8 +152,9 @@ int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - uint64_t gpa, pte; + uint64_t *pte; uint64_t *hva; + uint64_t gpa; int rc; /* Tell stdout not to buffer its content */ @@ -178,8 +179,9 @@ int main(int argc, char *argv[]) virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); + pte = vm_get_page_table_entry(vm, vcpu, MEM_REGION_GVA); - vm_set_page_table_entry(vm, vcpu, MEM_REGION_GVA, pte | (1ull << 36)); + *pte |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); process_exit_on_emulation_error(vcpu); -- GitLab From 751f280017b697d98936618a21ca3defdc03a9f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:07 +0000 Subject: [PATCH 0688/1423] KVM: selftests: Drop reserved bit checks from PTE accessor Drop the reserved bit checks from the helper to retrieve a PTE, there's very little value in sanity checking the constructed page tables as any will quickly be noticed in the form of an unexpected #PF. The checks also place unnecessary restrictions on the usage of the helpers, e.g. if a test _wanted_ to set reserved bits for whatever reason. Removing the NX check in particular allows for the removal of the @vcpu param, which will in turn allow the helper to be reused nearly verbatim for addr_gva2gpa(). Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-3-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 3 +-- .../selftests/kvm/lib/x86_64/processor.c | 26 +------------------ .../kvm/x86_64/emulator_error_test.c | 2 +- 3 files changed, 3 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 30d5df1ebaad0..53d52a5ace48b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -827,8 +827,7 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) bool kvm_is_tdp_enabled(void); -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr); +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 90b35998b0f3a..9e196837a794b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -241,29 +241,11 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, - uint64_t vaddr) +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; uint64_t *pte; - struct kvm_sregs sregs; - uint64_t rsvd_mask = 0; - - /* Set the high bits in the reserved mask. */ - if (vm->pa_bits < 52) - rsvd_mask = GENMASK_ULL(51, vm->pa_bits); - - /* - * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries - * with 4-Level Paging and 5-Level Paging". - * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1, - * the XD flag (bit 63) is reserved. - */ - vcpu_sregs_get(vcpu, &sregs); - if ((sregs.efer & EFER_NX) == 0) { - rsvd_mask |= PTE_NX_MASK; - } TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -286,24 +268,18 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, pml4e = addr_gpa2hva(vm, vm->pgd); TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK, "Expected pml4e to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0, - "Unexpected reserved bits set."); pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK, "Expected pdpe to be present for gva: 0x%08lx", vaddr); TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK), "Expected pdpe to map a pde not a 1-GByte page."); - TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0, - "Unexpected reserved bits set."); pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK, "Expected pde to be present for gva: 0x%08lx", vaddr); TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK), "Expected pde to map a pte not a 2-MByte page."); - TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0, - "Unexpected reserved bits set."); pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK, diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index bde247f3c8a16..1abb347357545 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -180,7 +180,7 @@ int main(int argc, char *argv[]) hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, vcpu, MEM_REGION_GVA); + pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); *pte |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); -- GitLab From 91add12d384c650d243b8ccdee1f2ddea3c9a85d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:08 +0000 Subject: [PATCH 0689/1423] KVM: selftests: Remove useless shifts when creating guest page tables Remove the pointless shift from GPA=>GFN and immediately back to GFN=>GPA when creating guest page tables. Ignore the other walkers that have a similar pattern for the moment, they will be converted to use virt_get_pte() in the near future. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-4-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 3 ++- .../selftests/kvm/lib/x86_64/processor.c | 17 ++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 53d52a5ace48b..9676a34647580 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -177,7 +177,8 @@ struct kvm_x86_cpu_feature { #define PAGE_MASK (~(PAGE_SIZE-1)) #define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) -#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK) +#define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT) /* General Registers in 64-Bit Mode */ struct gpr64_regs { diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 9e196837a794b..324bf24564a1a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -131,23 +131,23 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) } } -static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, +static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_gpa, uint64_t vaddr, int level) { - uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift); + uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; return &page_table[index]; } static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, - uint64_t pt_pfn, + uint64_t pt_gpa, uint64_t vaddr, uint64_t paddr, int current_level, int target_level) { - uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level); + uint64_t *pte = virt_get_pte(vm, pt_gpa, vaddr, current_level); if (!(*pte & PTE_PRESENT_MASK)) { *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; @@ -197,21 +197,20 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, - vaddr, paddr, PG_LEVEL_512G, level); + pml4e = virt_create_upper_pte(vm, vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); if (*pml4e & PTE_LARGE_MASK) return; - pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level); + pdpe = virt_create_upper_pte(vm, PTE_GET_PA(*pml4e), vaddr, paddr, PG_LEVEL_1G, level); if (*pdpe & PTE_LARGE_MASK) return; - pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level); + pde = virt_create_upper_pte(vm, PTE_GET_PA(*pdpe), vaddr, paddr, PG_LEVEL_2M, level); if (*pde & PTE_LARGE_MASK) return; /* Fill in page table entry. */ - pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, PTE_GET_PA(*pde), vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); -- GitLab From ed0b58fc6f0bdde360c28314e0faedc9a0a6c3de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:09 +0000 Subject: [PATCH 0690/1423] KVM: selftests: Verify parent PTE is PRESENT when getting child PTE Verify the parent PTE is PRESENT when getting a child via virt_get_pte() so that the helper can be used for getting PTEs/GPAs without losing sanity checks that the walker isn't wandering into the weeds. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-5-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 324bf24564a1a..c9649f19aca17 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -131,23 +131,28 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) } } -static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_gpa, uint64_t vaddr, - int level) +static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, + uint64_t vaddr, int level) { + uint64_t pt_gpa = PTE_GET_PA(*parent_pte); uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", + level + 1, vaddr); + return &page_table[index]; } static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, - uint64_t pt_gpa, + uint64_t *parent_pte, uint64_t vaddr, uint64_t paddr, int current_level, int target_level) { - uint64_t *pte = virt_get_pte(vm, pt_gpa, vaddr, current_level); + uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); if (!(*pte & PTE_PRESENT_MASK)) { *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; @@ -197,20 +202,20 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - pml4e = virt_create_upper_pte(vm, vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); + pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); if (*pml4e & PTE_LARGE_MASK) return; - pdpe = virt_create_upper_pte(vm, PTE_GET_PA(*pml4e), vaddr, paddr, PG_LEVEL_1G, level); + pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); if (*pdpe & PTE_LARGE_MASK) return; - pde = virt_create_upper_pte(vm, PTE_GET_PA(*pdpe), vaddr, paddr, PG_LEVEL_2M, level); + pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); if (*pde & PTE_LARGE_MASK) return; /* Fill in page table entry. */ - pte = virt_get_pte(vm, PTE_GET_PA(*pde), vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); -- GitLab From 99d51c6eef2dadc204363ab3bf58c91d02f895be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:10 +0000 Subject: [PATCH 0691/1423] KVM: selftests: Use virt_get_pte() when getting PTE pointer Use virt_get_pte() in vm_get_page_table_entry() instead of open coding equivalent code. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-6-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c9649f19aca17..09b550fd88152 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -247,9 +247,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { - uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -264,32 +262,17 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), "Canonical check failed. The virtual address is invalid."); - index[0] = (vaddr >> 12) & 0x1ffu; - index[1] = (vaddr >> 21) & 0x1ffu; - index[2] = (vaddr >> 30) & 0x1ffu; - index[3] = (vaddr >> 39) & 0x1ffu; - - pml4e = addr_gpa2hva(vm, vm->pgd); - TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK, - "Expected pml4e to be present for gva: 0x%08lx", vaddr); + pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); - TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK, - "Expected pdpe to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK), + pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); + TEST_ASSERT(!(*pdpe & PTE_LARGE_MASK), "Expected pdpe to map a pde not a 1-GByte page."); - pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); - TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK, - "Expected pde to be present for gva: 0x%08lx", vaddr); - TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK), + pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); + TEST_ASSERT(!(*pde & PTE_LARGE_MASK), "Expected pde to map a pte not a 2-MByte page."); - pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); - TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK, - "Expected pte to be present for gva: 0x%08lx", vaddr); - - return &pte[index[0]]; + return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); } void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -- GitLab From efe91dc307d00766911fbcb5021bdc3a1cf9c79e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:11 +0000 Subject: [PATCH 0692/1423] KVM: selftests: Use vm_get_page_table_entry() in addr_arch_gva2gpa() Use vm_get_page_table_entry() in addr_arch_gva2gpa() to get the leaf PTE instead of manually walking page tables. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-7-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 38 ++----------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 09b550fd88152..053f64191122b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -458,41 +458,11 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { - uint16_t index[4]; - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; - - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - index[0] = (gva >> 12) & 0x1ffu; - index[1] = (gva >> 21) & 0x1ffu; - index[2] = (gva >> 30) & 0x1ffu; - index[3] = (gva >> 39) & 0x1ffu; - - if (!vm->pgd_created) - goto unmapped_gva; - pml4e = addr_gpa2hva(vm, vm->pgd); - if (!(pml4e[index[3]] & PTE_PRESENT_MASK)) - goto unmapped_gva; - - pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); - if (!(pdpe[index[2]] & PTE_PRESENT_MASK)) - goto unmapped_gva; - - pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); - if (!(pde[index[1]] & PTE_PRESENT_MASK)) - goto unmapped_gva; - - pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); - if (!(pte[index[0]] & PTE_PRESENT_MASK)) - goto unmapped_gva; - - return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK); + uint64_t *pte = vm_get_page_table_entry(vm, gva); -unmapped_gva: - TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); - exit(EXIT_FAILURE); + TEST_ASSERT(*pte & PTE_PRESENT_MASK, + "Leaf PTE not PRESENT for gva: 0x%08lx", gva); + return PTE_GET_PA(*pte) | (gva & ~PAGE_MASK); } static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) -- GitLab From 96b69958c77d84e49c06ebe2e3502e4c1620e3c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:45:12 +0000 Subject: [PATCH 0693/1423] KVM: selftests: Play nice with huge pages when getting PTEs/GPAs Play nice with huge pages when getting PTEs and translating GVAs to GPAs, there's no reason to disallow using huge pages in selftests. Use PG_LEVEL_NONE to indicate that the caller doesn't care about the mapping level and just wants to get the pte+level. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006004512.666529-8-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 11 ++++- .../selftests/kvm/lib/x86_64/processor.c | 45 ++++++++++++++++--- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 9676a34647580..e000e35c948fa 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -172,11 +172,16 @@ struct kvm_x86_cpu_feature { #define PTE_GLOBAL_MASK BIT_ULL(8) #define PTE_NX_MASK BIT_ULL(63) +#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) + #define PAGE_SHIFT 12 #define PAGE_SIZE (1ULL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_MASK (~(PAGE_SIZE-1) & PHYSICAL_PAGE_MASK) + +#define HUGEPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define HUGEPAGE_SIZE(x) (1UL << HUGEPAGE_SHIFT(x)) +#define HUGEPAGE_MASK(x) (~(HUGEPAGE_SIZE(x) - 1) & PHYSICAL_PAGE_MASK) -#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) #define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK) #define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT) @@ -828,6 +833,8 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) bool kvm_is_tdp_enabled(void); +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level); uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 053f64191122b..efa20d0f99271 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -245,10 +245,26 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +{ + if (*pte & PTE_LARGE_MASK) { + TEST_ASSERT(*level == PG_LEVEL_NONE || + *level == current_level, + "Unexpected hugepage at level %d\n", current_level); + *level = current_level; + } + + return *level == current_level; +} + +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) { uint64_t *pml4e, *pdpe, *pde; + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + "Invalid PG_LEVEL_* '%d'", *level); + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, @@ -263,18 +279,27 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) "Canonical check failed. The virtual address is invalid."); pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); + if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) + return pml4e; pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - TEST_ASSERT(!(*pdpe & PTE_LARGE_MASK), - "Expected pdpe to map a pde not a 1-GByte page."); + if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) + return pdpe; pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - TEST_ASSERT(!(*pde & PTE_LARGE_MASK), - "Expected pde to map a pte not a 2-MByte page."); + if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) + return pde; return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); } +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +{ + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, vaddr, &level); +} + void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { uint64_t *pml4e, *pml4e_start; @@ -458,11 +483,17 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { - uint64_t *pte = vm_get_page_table_entry(vm, gva); + int level = PG_LEVEL_NONE; + uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); TEST_ASSERT(*pte & PTE_PRESENT_MASK, "Leaf PTE not PRESENT for gva: 0x%08lx", gva); - return PTE_GET_PA(*pte) | (gva & ~PAGE_MASK); + + /* + * No need for a hugepage mask on the PTE, x86-64 requires the "unused" + * address bits to be zero. + */ + return PTE_GET_PA(*pte) | (gva & ~HUGEPAGE_MASK(level)); } static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) -- GitLab From 197ebb713ad04518ffef6d966954b519a0805100 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Tue, 15 Nov 2022 21:38:43 +0000 Subject: [PATCH 0694/1423] KVM: selftests: move common startup logic to kvm_util.c Consolidate common startup logic in one place by implementing a single setup function with __attribute((constructor)) for all selftests within kvm_util.c. This allows moving logic like: /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); to a single file for all selftests. This will also allow any required setup at entry in future to be done in common main function. Link: https://lore.kernel.org/lkml/Ywa9T+jKUpaHLu%2Fl@google.com Suggested-by: Sean Christopherson Reviewed-by: Andrew Jones Reviewed-by: Peter Gonda Signed-off-by: Vishal Annapurve Link: https://lore.kernel.org/r/20221115213845.3348210-2-vannapurve@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 3 --- tools/testing/selftests/kvm/aarch64/hypercalls.c | 2 -- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 3 --- tools/testing/selftests/kvm/lib/kvm_util.c | 6 ++++++ tools/testing/selftests/kvm/memslot_perf_test.c | 3 --- tools/testing/selftests/kvm/rseq_test.c | 3 --- tools/testing/selftests/kvm/s390x/memop.c | 2 -- tools/testing/selftests/kvm/s390x/resets.c | 2 -- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 3 --- tools/testing/selftests/kvm/set_memory_region_test.c | 3 --- tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 3 --- tools/testing/selftests/kvm/x86_64/emulator_error_test.c | 3 --- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 3 --- tools/testing/selftests/kvm/x86_64/platform_info_test.c | 3 --- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 3 --- tools/testing/selftests/kvm/x86_64/set_sregs_test.c | 3 --- .../selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 3 --- tools/testing/selftests/kvm/x86_64/sync_regs_test.c | 3 --- tools/testing/selftests/kvm/x86_64/userspace_io_test.c | 3 --- .../testing/selftests/kvm/x86_64/userspace_msr_exit_test.c | 3 --- 20 files changed, 6 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 54016cdd8a098..f2a96779716a8 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -446,9 +446,6 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - if (!parse_args(argc, argv)) exit(KSFT_SKIP); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 3dceecfd1f628..bef1499fb4654 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -305,8 +305,6 @@ static void test_run(void) int main(void) { - setbuf(stdout, NULL); - test_run(); return 0; } diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index e0310ebc313ca..90d854e0fcffe 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -817,9 +817,6 @@ int main(int argc, char **argv) int opt; bool eoi_split = false; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { switch (opt) { case 'n': diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 07c8edd4e5486..575a0c38d1c0c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2086,3 +2086,9 @@ void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, break; } } + +void __attribute((constructor)) kvm_selftest_init(void) +{ + /* Tell stdout not to buffer its content. */ + setbuf(stdout, NULL); +} diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index d771262ea584b..36b20abfb948e 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -994,9 +994,6 @@ int main(int argc, char *argv[]) struct test_result rbestslottime; int tctr; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - if (!parse_args(argc, argv, &targs)) return -1; diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 0e9e2b48a51f7..3045fdf9bdf59 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -205,9 +205,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 9113696d5178a..3fd81e58f40c2 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -760,8 +760,6 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP)); - setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ - ksft_print_header(); ksft_set_plan(ARRAY_SIZE(testlist)); diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 19486084eb309..e41e2cb8ffa97 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -296,8 +296,6 @@ int main(int argc, char *argv[]) bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); int idx; - setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ - ksft_print_header(); ksft_set_plan(ARRAY_SIZE(testlist)); diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 3fdb6e2598ebe..2ddde41c44ba9 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -231,9 +231,6 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - ksft_print_header(); ksft_set_plan(ARRAY_SIZE(testlist)); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 85c16f09a50e1..2ef1d1b72ce43 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -392,9 +392,6 @@ int main(int argc, char *argv[]) int i, loops; #endif - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - #ifdef __x86_64__ /* * FIXME: the zero-memslot test fails on aarch64 and s390x because diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 4208487652f8e..1027a671c7d3a 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -57,9 +57,6 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 1abb347357545..d945e571e7a0c 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -157,9 +157,6 @@ int main(int argc, char *argv[]) uint64_t gpa; int rc; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index e804eb08dff90..5c27efbf405e1 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -134,9 +134,6 @@ int main(int argc, char *argv[]) const struct kvm_cpuid2 *hv_cpuid_entries; struct kvm_vcpu *vcpu; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 76417c7d687bd..310a104d94f06 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -72,9 +72,6 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; uint64_t msr_platform_info; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index ea4e259a1e2ef..a6ffa245c8971 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -447,9 +447,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 2bb08bf2125dd..a284fcef6ed74 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -82,9 +82,6 @@ int main(int argc, char *argv[]) uint64_t cr4; int rc; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - /* * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and * use it to verify all supported CR4 bits can be set prior to defining diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index e637d77360123..e497ace629c19 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -194,9 +194,6 @@ done: int main(int argc, char *argv[]) { - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS), diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index 9b6db0b0b13e9..d2f9b5bdfab20 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -90,9 +90,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu_events events; int rv, cap; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS); TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD)); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c index 7316521428f86..91076c9787b41 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -56,9 +56,6 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; struct ucall uc; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index fae95089e6551..25fa55344a10e 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -818,9 +818,6 @@ static void test_user_exit_msr_flags(void) int main(int argc, char *argv[]) { - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - test_msr_filter_allow(); test_msr_filter_deny(); -- GitLab From e1ab31245c4efe973db4dcd6c82a41c2aec09b27 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Tue, 15 Nov 2022 21:38:44 +0000 Subject: [PATCH 0695/1423] KVM: selftests: Add arch specific initialization Introduce arch specific API: kvm_selftest_arch_init to allow each arch to handle initialization before running any selftest logic. Suggested-by: Sean Christopherson Reviewed-by: Andrew Jones Reviewed-by: Peter Gonda Signed-off-by: Vishal Annapurve Link: https://lore.kernel.org/r/20221115213845.3348210-3-vannapurve@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/kvm_util_base.h | 7 +++++++ .../selftests/kvm/lib/aarch64/processor.c | 18 +++++++++--------- tools/testing/selftests/kvm/lib/kvm_util.c | 6 ++++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index a7047e0767d33..c58eec6d44a7e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -857,4 +857,11 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); } +/* + * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * to allow for arch-specific setup that is common to all tests, e.g. computing + * the default guest "mode". + */ +void kvm_selftest_arch_init(void); + #endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6f5551368944e..0de4aabc0c764 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -495,15 +495,6 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, close(kvm_fd); } -/* - * arm64 doesn't have a true default mode, so start by computing the - * available IPA space and page sizes early. - */ -void __attribute__((constructor)) init_guest_modes(void) -{ - guest_modes_append_default(); -} - void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, struct arm_smccc_res *res) @@ -528,3 +519,12 @@ void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7"); } + +void kvm_selftest_arch_init(void) +{ + /* + * arm64 doesn't have a true default mode, so start by computing the + * available IPA space and page sizes early. + */ + guest_modes_append_default(); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 575a0c38d1c0c..db62680d59180 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2087,8 +2087,14 @@ void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, } } +__weak void kvm_selftest_arch_init(void) +{ +} + void __attribute((constructor)) kvm_selftest_init(void) { /* Tell stdout not to buffer its content. */ setbuf(stdout, NULL); + + kvm_selftest_arch_init(); } -- GitLab From 2115713cfab05dd6efd48ad6bf56e67f556dcec5 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Tue, 15 Nov 2022 21:38:45 +0000 Subject: [PATCH 0696/1423] KVM: selftests: Add arch specific post vm creation hook Add arch specific API kvm_arch_vm_post_create to perform any required setup after VM creation. Suggested-by: Sean Christopherson Reviewed-by: Andrew Jones Reviewed-by: Peter Gonda Signed-off-by: Vishal Annapurve Link: https://lore.kernel.org/r/20221115213845.3348210-4-vannapurve@google.com [sean: place x86's implementation by vm_arch_vcpu_add()] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util_base.h | 2 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 8 +++++--- tools/testing/selftests/kvm/lib/x86_64/processor.c | 5 +++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c58eec6d44a7e..228212ede05eb 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -864,4 +864,6 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) */ void kvm_selftest_arch_init(void); +void kvm_arch_vm_post_create(struct kvm_vm *vm); + #endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index db62680d59180..5ac8f207ed92d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -351,9 +351,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, slot0 = memslot2region(vm, 0); ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif + kvm_arch_vm_post_create(vm); return vm; } @@ -2087,6 +2085,10 @@ void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, } } +__weak void kvm_arch_vm_post_create(struct kvm_vm *vm) +{ +} + __weak void kvm_selftest_arch_init(void) { } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index efa20d0f99271..999576146d69d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -586,6 +586,11 @@ void __vm_xsave_require_permission(int bit, const char *name) bitmask); } +void kvm_arch_vm_post_create(struct kvm_vm *vm) +{ + vm_create_irqchip(vm); +} + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code) { -- GitLab From 376bc1b458c9a6db347c70f02fa6eb8b5f187455 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Mon, 17 Oct 2022 23:28:19 +0530 Subject: [PATCH 0697/1423] KVM: selftests: Don't assume vcpu->id is '0' in xAPIC state test In xapic_state_test's test_icr(), explicitly skip iterations that would match vcpu->id instead of assuming vcpu->id is '0', so that IPIs are are correctly sent to non-existent vCPUs. Suggested-by: Sean Christopherson Link: https://lore.kernel.org/kvm/YyoZr9rXSSMEtdh5@google.com Signed-off-by: Gautam Menghani Link: https://lore.kernel.org/r/20221017175819.12672-1-gautammenghani201@gmail.com [sean: massage shortlog and changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 6f7a5ef667186..d7d37dae3eebf 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -114,7 +114,9 @@ static void test_icr(struct xapic_vcpu *x) * vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff. */ icr = APIC_INT_ASSERT | 0xff; - for (i = vcpu->id + 1; i < 0xff; i++) { + for (i = 0; i < 0xff; i++) { + if (i == vcpu->id) + continue; for (j = 0; j < 8; j++) __test_icr(x, i << (32 + 24) | icr | (j << 8)); } -- GitLab From 52d3a4fb5be1579b0fb8d48f8abaf9a9c7db1083 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:45 -0700 Subject: [PATCH 0698/1423] KVM: selftests: Rename emulator_error_test to smaller_maxphyaddr_emulation_test Rename emulator_error_test to smaller_maxphyaddr_emulation_test and update the comment at the top of the file to document that this is explicitly a test to validate that KVM emulates instructions in response to an EPT violation when emulating a smaller MAXPHYADDR. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221102184654.282799-2-dmatlack@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/.gitignore | 2 +- tools/testing/selftests/kvm/Makefile | 2 +- ...ulator_error_test.c => smaller_maxphyaddr_emulation_test.c} | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) rename tools/testing/selftests/kvm/x86_64/{emulator_error_test.c => smaller_maxphyaddr_emulation_test.c} (96%) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 2f0d705db9dba..053e5d34cd030 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -17,7 +17,6 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test -/x86_64/emulator_error_test /x86_64/fix_hypercall_test /x86_64/get_msr_index_features /x86_64/kvm_clock_test @@ -36,6 +35,7 @@ /x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/sev_migrate_tests +/x86_64/smaller_maxphyaddr_emulation_test /x86_64/smm_test /x86_64/state_test /x86_64/svm_vmcall_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 6e2a683629c77..cff3a7ff87829 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -82,7 +82,6 @@ TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test -TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid @@ -97,6 +96,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test +TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c similarity index 96% rename from tools/testing/selftests/kvm/x86_64/emulator_error_test.c rename to tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index d945e571e7a0c..6fe5af8bd47ef 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -2,7 +2,8 @@ /* * Copyright (C) 2020, Google LLC. * - * Tests for KVM_CAP_EXIT_ON_EMULATION_FAILURE capability. + * Test that KVM emulates instructions in response to EPT violations when + * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. */ #define _GNU_SOURCE /* for program_invocation_short_name */ -- GitLab From 48e59373398a554098863bc7a3d1350cd0d5c4d0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:46 -0700 Subject: [PATCH 0699/1423] KVM: selftests: Explicitly require instructions bytes Hard-code the flds instruction and assert the exact instruction bytes are present in run->emulation_failure. The test already requires the instruction bytes to be present because that's the only way the test will advance the RIP past the flds and get to GUEST_DONE(). Note that KVM does not necessarily return exactly 2 bytes in run->emulation_failure since it may not know the exact instruction length in all cases. So just assert that run->emulation_failure.insn_size is at least 2. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221102184654.282799-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../smaller_maxphyaddr_emulation_test.c | 68 ++++++------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 6fe5af8bd47ef..6f7891b0e193b 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -19,41 +19,20 @@ #define MEM_REGION_SLOT 10 #define MEM_REGION_SIZE PAGE_SIZE +#define FLDS_MEM_EAX ".byte 0xd9, 0x00" + static void guest_code(void) { - __asm__ __volatile__("flds (%[addr])" - :: [addr]"r"(MEM_REGION_GVA)); + __asm__ __volatile__(FLDS_MEM_EAX :: "a"(MEM_REGION_GVA)); GUEST_DONE(); } -/* - * Accessors to get R/M, REG, and Mod bits described in the SDM vol 2, - * figure 2-2 "Table Interpretation of ModR/M Byte (C8H)". - */ -#define GET_RM(insn_byte) (insn_byte & 0x7) -#define GET_REG(insn_byte) ((insn_byte & 0x38) >> 3) -#define GET_MOD(insn_byte) ((insn_byte & 0xc) >> 6) - -/* Ensure we are dealing with a simple 2-byte flds instruction. */ -static bool is_flds(uint8_t *insn_bytes, uint8_t insn_size) -{ - return insn_size >= 2 && - insn_bytes[0] == 0xd9 && - GET_REG(insn_bytes[1]) == 0x0 && - GET_MOD(insn_bytes[1]) == 0x0 && - /* Ensure there is no SIB byte. */ - GET_RM(insn_bytes[1]) != 0x4 && - /* Ensure there is no displacement byte. */ - GET_RM(insn_bytes[1]) != 0x5; -} - static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_regs regs; uint8_t *insn_bytes; - uint8_t insn_size; uint64_t flags; TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, @@ -65,30 +44,23 @@ static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) "Unexpected suberror: %u", run->emulation_failure.suberror); - if (run->emulation_failure.ndata >= 1) { - flags = run->emulation_failure.flags; - if ((flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES) && - run->emulation_failure.ndata >= 3) { - insn_size = run->emulation_failure.insn_size; - insn_bytes = run->emulation_failure.insn_bytes; - - TEST_ASSERT(insn_size <= 15 && insn_size > 0, - "Unexpected instruction size: %u", - insn_size); - - TEST_ASSERT(is_flds(insn_bytes, insn_size), - "Unexpected instruction. Expected 'flds' (0xd9 /0)"); - - /* - * If is_flds() succeeded then the instruction bytes - * contained an flds instruction that is 2-bytes in - * length (ie: no prefix, no SIB, no displacement). - */ - vcpu_regs_get(vcpu, ®s); - regs.rip += 2; - vcpu_regs_set(vcpu, ®s); - } - } + flags = run->emulation_failure.flags; + TEST_ASSERT(run->emulation_failure.ndata >= 3 && + flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, + "run->emulation_failure is missing instruction bytes"); + + TEST_ASSERT(run->emulation_failure.insn_size >= 2, + "Expected a 2-byte opcode for 'flds', got %d bytes", + run->emulation_failure.insn_size); + + insn_bytes = run->emulation_failure.insn_bytes; + TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, + "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x\n", + insn_bytes[0], insn_bytes[1]); + + vcpu_regs_get(vcpu, ®s); + regs.rip += 2; + vcpu_regs_set(vcpu, ®s); } static void do_guest_assert(struct ucall *uc) -- GitLab From 50824c6eee39eb2d7a60d665b4b245552e852705 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:47 -0700 Subject: [PATCH 0700/1423] KVM: selftests: Delete dead ucall code Delete a bunch of code related to ucall handling from smaller_maxphyaddr_emulation_test. The only thing smaller_maxphyaddr_emulation_test needs to check is that the vCPU exits with UCALL_DONE after the second vcpu_run(). Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221102184654.282799-4-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../smaller_maxphyaddr_emulation_test.c | 61 +------------------ 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 6f7891b0e193b..b72aeefda70c6 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -63,64 +63,6 @@ static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) vcpu_regs_set(vcpu, ®s); } -static void do_guest_assert(struct ucall *uc) -{ - REPORT_GUEST_ASSERT(*uc); -} - -static void check_for_guest_assert(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (vcpu->run->exit_reason == KVM_EXIT_IO && - get_ucall(vcpu, &uc) == UCALL_ABORT) { - do_guest_assert(&uc); - } -} - -static void process_ucall_done(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct ucall uc; - - check_for_guest_assert(vcpu); - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s)", - run->exit_reason, - exit_reason_str(run->exit_reason)); - - TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, - "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", - uc.cmd, UCALL_DONE); -} - -static uint64_t process_ucall(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct ucall uc; - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s)", - run->exit_reason, - exit_reason_str(run->exit_reason)); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - break; - case UCALL_ABORT: - do_guest_assert(&uc); - break; - case UCALL_DONE: - process_ucall_done(vcpu); - break; - default: - TEST_ASSERT(false, "Unexpected ucall"); - } - - return uc.cmd; -} - int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; @@ -156,8 +98,7 @@ int main(int argc, char *argv[]) vcpu_run(vcpu); process_exit_on_emulation_error(vcpu); vcpu_run(vcpu); - - TEST_ASSERT(process_ucall(vcpu) == UCALL_DONE, "Expected UCALL_DONE"); + ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); kvm_vm_free(vm); -- GitLab From 19a2b32f5d242844e49764cf88256cfd53de0082 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:48 -0700 Subject: [PATCH 0701/1423] KVM: selftests: Move flds instruction emulation failure handling to header Move the flds instruction emulation failure handling code to a header so it can be re-used in an upcoming test. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20221102184654.282799-5-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/flds_emulation.h | 55 +++++++++++++++++++ .../smaller_maxphyaddr_emulation_test.c | 44 ++------------- 2 files changed, 59 insertions(+), 40 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/flds_emulation.h diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h new file mode 100644 index 0000000000000..e43a7df25f2c5 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/flds_emulation.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_FLDS_EMULATION_H +#define SELFTEST_KVM_FLDS_EMULATION_H + +#include "kvm_util.h" + +#define FLDS_MEM_EAX ".byte 0xd9, 0x00" + +/* + * flds is an instruction that the KVM instruction emulator is known not to + * support. This can be used in guest code along with a mechanism to force + * KVM to emulate the instruction (e.g. by providing an MMIO address) to + * exercise emulation failures. + */ +static inline void flds(uint64_t address) +{ + __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address)); +} + +static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + uint8_t *insn_bytes; + uint64_t flags; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason: %u (%s)", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Unexpected suberror: %u", + run->emulation_failure.suberror); + + flags = run->emulation_failure.flags; + TEST_ASSERT(run->emulation_failure.ndata >= 3 && + flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, + "run->emulation_failure is missing instruction bytes"); + + TEST_ASSERT(run->emulation_failure.insn_size >= 2, + "Expected a 2-byte opcode for 'flds', got %d bytes", + run->emulation_failure.insn_size); + + insn_bytes = run->emulation_failure.insn_bytes; + TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, + "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x\n", + insn_bytes[0], insn_bytes[1]); + + vcpu_regs_get(vcpu, ®s); + regs.rip += 2; + vcpu_regs_set(vcpu, ®s); +} + +#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */ diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index b72aeefda70c6..a8d0811789176 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -8,6 +8,8 @@ #define _GNU_SOURCE /* for program_invocation_short_name */ +#include "flds_emulation.h" + #include "test_util.h" #include "kvm_util.h" #include "vmx.h" @@ -19,50 +21,12 @@ #define MEM_REGION_SLOT 10 #define MEM_REGION_SIZE PAGE_SIZE -#define FLDS_MEM_EAX ".byte 0xd9, 0x00" - static void guest_code(void) { - __asm__ __volatile__(FLDS_MEM_EAX :: "a"(MEM_REGION_GVA)); - + flds(MEM_REGION_GVA); GUEST_DONE(); } -static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - struct kvm_regs regs; - uint8_t *insn_bytes; - uint64_t flags; - - TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, - "Unexpected exit reason: %u (%s)", - run->exit_reason, - exit_reason_str(run->exit_reason)); - - TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, - "Unexpected suberror: %u", - run->emulation_failure.suberror); - - flags = run->emulation_failure.flags; - TEST_ASSERT(run->emulation_failure.ndata >= 3 && - flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, - "run->emulation_failure is missing instruction bytes"); - - TEST_ASSERT(run->emulation_failure.insn_size >= 2, - "Expected a 2-byte opcode for 'flds', got %d bytes", - run->emulation_failure.insn_size); - - insn_bytes = run->emulation_failure.insn_bytes; - TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, - "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x\n", - insn_bytes[0], insn_bytes[1]); - - vcpu_regs_get(vcpu, ®s); - regs.rip += 2; - vcpu_regs_set(vcpu, ®s); -} - int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; @@ -96,7 +60,7 @@ int main(int argc, char *argv[]) *pte |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); - process_exit_on_emulation_error(vcpu); + handle_flds_emulation_failure_exit(vcpu); vcpu_run(vcpu); ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); -- GitLab From d6ecfe976ac342e5c3249b9439da762f65e98015 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:49 -0700 Subject: [PATCH 0702/1423] KVM: x86/mmu: Use BIT{,_ULL}() for PFERR masks Use the preferred BIT() and BIT_ULL() to construct the PFERR masks rather than open-coding the bit shifting. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221102184654.282799-6-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81114a376c4ee..598eb3b9ae440 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -256,16 +256,16 @@ enum x86_intercept_stage; #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) -#define PFERR_PK_MASK (1U << PFERR_PK_BIT) -#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ -- GitLab From 77f7813cc2b9d95166a3539a5203663d5bbb0fd0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:50 -0700 Subject: [PATCH 0703/1423] KVM: selftests: Copy KVM PFERR masks into selftests Copy KVM's macros for page fault error masks into processor.h so they can be used in selftests. Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221102184654.282799-7-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/processor.h | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e000e35c948fa..159ea618d90f8 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -887,4 +887,27 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT) #define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK | \ XSTATE_XTILE_DATA_MASK) + +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 +#define PFERR_IMPLICIT_ACCESS_BIT 48 + +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) + #endif /* SELFTEST_KVM_PROCESSOR_H */ -- GitLab From f2e5b53b4ba9bc10d3febc3682bdf22e946bf6eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Nov 2022 11:46:51 -0700 Subject: [PATCH 0704/1423] KVM: selftests: Avoid JMP in non-faulting path of KVM_ASM_SAFE() Clear R9 in the non-faulting path of KVM_ASM_SAFE() and fall through to to a common load of "vector" to effectively load "vector" with '0' to reduce the code footprint of the asm blob, to reduce the runtime overhead of the non-faulting path (when "vector" is stored in a register), and so that additional output constraints that are valid if and only if a fault occur are loaded even in the non-faulting case. A future patch will add a 64-bit output for the error code, and if its output is not explicitly loaded with _something_, the user of the asm blob can end up technically consuming uninitialized data. Using a common path to load the output constraints will allow using an existing scratch register, e.g. r10, to hold the error code in the faulting path, while also guaranteeing the error code is initialized with deterministic data in the non-faulting patch (r10 is loaded with the RIP of to-be-executed instruction). Consuming the error code when a fault doesn't occur would obviously be a test bug, but there's no guarantee the compiler will detect uninitialized consumption. And conversely, it's theoretically possible that the compiler might throw a false positive on uninitialized data, e.g. if the compiler can't determine that the non-faulting path won't touch the error code. Alternatively, the error code could be explicitly loaded in the non-faulting path, but loading a 64-bit memory|register output operand with an explicitl value requires a sign-extended "MOV imm32, r/m64", which isn't exactly straightforward and has a largish code footprint. And loading the error code with what is effectively garbage (from a scratch register) avoids having to choose an arbitrary value for the non-faulting case. Opportunistically remove a rogue asterisk in the block comment. Signed-off-by: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20221102184654.282799-8-dmatlack@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 159ea618d90f8..6f4c727d68188 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -770,7 +770,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, * for recursive faults when accessing memory in the handler. The downside to * using registers is that it restricts what registers can be used by the actual * instruction. But, selftests are 64-bit only, making register* pressure a - * minor concern. Use r9-r11 as they are volatile, i.e. don't need* to be saved + * minor concern. Use r9-r11 as they are volatile, i.e. don't need to be saved * by the callee, and except for r11 are not implicit parameters to any * instructions. Ideally, fixup would use r8-r10 and thus avoid implicit * parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V @@ -792,11 +792,9 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, "lea 1f(%%rip), %%r10\n\t" \ "lea 2f(%%rip), %%r11\n\t" \ "1: " insn "\n\t" \ - "movb $0, %[vector]\n\t" \ - "jmp 3f\n\t" \ + "xor %%r9, %%r9\n\t" \ "2:\n\t" \ - "mov %%r9b, %[vector]\n\t" \ - "3:\n\t" + "mov %%r9b, %[vector]\n\t" #define KVM_ASM_SAFE_OUTPUTS(v) [vector] "=qm"(v) #define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" -- GitLab From b9635930f0a73c1ef7b465121896c3fb2e3b77cd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Nov 2022 11:46:52 -0700 Subject: [PATCH 0705/1423] KVM: selftests: Provide error code as a KVM_ASM_SAFE() output Provide the error code on a fault in KVM_ASM_SAFE(), e.g. to allow tests to assert that #PF generates the correct error code without needing to manually install a #PF handler. Use r10 as the scratch register for the error code, as it's already clobbered by the asm blob (loaded with the RIP of the to-be-executed instruction). Deliberately load the output "error_code" even in the non-faulting path so that error_code is always initialized with deterministic data (the aforementioned RIP), i.e to ensure a selftest won't end up with uninitialized consumption regardless of how KVM_ASM_SAFE() is used. Don't clear r10 in the non-faulting case and instead load error code with the RIP (see above). The error code is valid if and only if an exception occurs, and '0' isn't necessarily a better "invalid" value, e.g. '0' could result in false passes for a buggy test. Signed-off-by: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20221102184654.282799-9-dmatlack@google.com --- .../selftests/kvm/include/x86_64/processor.h | 39 +++++++++++++------ .../selftests/kvm/lib/x86_64/processor.c | 1 + .../selftests/kvm/x86_64/hyperv_features.c | 3 +- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 6f4c727d68188..6093d0d53b4e6 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -786,6 +786,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, * * REGISTER OUTPUTS: * r9 = exception vector (non-zero) + * r10 = error code */ #define KVM_ASM_SAFE(insn) \ "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ @@ -794,29 +795,43 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, "1: " insn "\n\t" \ "xor %%r9, %%r9\n\t" \ "2:\n\t" \ - "mov %%r9b, %[vector]\n\t" + "mov %%r9b, %[vector]\n\t" \ + "mov %%r10, %[error_code]\n\t" -#define KVM_ASM_SAFE_OUTPUTS(v) [vector] "=qm"(v) +#define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec) #define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" -#define kvm_asm_safe(insn, inputs...) \ -({ \ - uint8_t vector; \ - \ - asm volatile(KVM_ASM_SAFE(insn) \ - : KVM_ASM_SAFE_OUTPUTS(vector) \ - : inputs \ - : KVM_ASM_SAFE_CLOBBERS); \ - vector; \ +#define kvm_asm_safe(insn, inputs...) \ +({ \ + uint64_t ign_error_code; \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_ec(insn, error_code, inputs...) \ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ }) static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val) { + uint64_t error_code; uint8_t vector; uint32_t a, d; asm volatile(KVM_ASM_SAFE("rdmsr") - : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector) + : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector, error_code) : "c"(msr) : KVM_ASM_SAFE_CLOBBERS); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 999576146d69d..4623874a805bc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1068,6 +1068,7 @@ static bool kvm_fixup_exception(struct ex_regs *regs) regs->rip = regs->r11; regs->r9 = regs->vector; + regs->r10 = regs->error_code; return true; } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 05b32e550a802..2b6d455acf8ae 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -18,6 +18,7 @@ static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, vm_vaddr_t output_address, uint64_t *hv_status) { + uint64_t error_code; uint8_t vector; /* Note both the hypercall and the "asm safe" clobber r9-r11. */ @@ -25,7 +26,7 @@ static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, KVM_ASM_SAFE("vmcall") : "=a" (*hv_status), "+c" (control), "+d" (input_address), - KVM_ASM_SAFE_OUTPUTS(vector) + KVM_ASM_SAFE_OUTPUTS(vector, error_code) : [output_address] "r"(output_address), "a" (-EFAULT) : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); -- GitLab From a323845d6c3d2f667274bde8145906580359fe06 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:53 -0700 Subject: [PATCH 0706/1423] KVM: selftests: Expect #PF(RSVD) when TDP is disabled Change smaller_maxphyaddr_emulation_test to expect a #PF(RSVD), rather than an emulation failure, when TDP is disabled. KVM only needs to emulate instructions to emulate a smaller guest.MAXPHYADDR when TDP is enabled. Fixes: 39bbcc3a4e39 ("selftests: kvm: Allows userspace to handle emulation errors.") Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20221102184654.282799-10-dmatlack@google.com [sean: massage comment to talk about having to emulate due to MAXPHYADDR] Signed-off-by: Sean Christopherson --- .../smaller_maxphyaddr_emulation_test.c | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index a8d0811789176..06edf00a97d61 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -21,9 +21,27 @@ #define MEM_REGION_SLOT 10 #define MEM_REGION_SIZE PAGE_SIZE -static void guest_code(void) +static void guest_code(bool tdp_enabled) { - flds(MEM_REGION_GVA); + uint64_t error_code; + uint64_t vector; + + vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA)); + + /* + * When TDP is enabled, flds will trigger an emulation failure, exit to + * userspace, and then the selftest host "VMM" skips the instruction. + * + * When TDP is disabled, no instruction emulation is required so flds + * should generate #PF(RSVD). + */ + if (tdp_enabled) { + GUEST_ASSERT(!vector); + } else { + GUEST_ASSERT_EQ(vector, PF_VECTOR); + GUEST_ASSERT(error_code & PFERR_RSVD_MASK); + } + GUEST_DONE(); } @@ -31,6 +49,7 @@ int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; + struct ucall uc; uint64_t *pte; uint64_t *hva; uint64_t gpa; @@ -39,6 +58,10 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR); @@ -60,9 +83,27 @@ int main(int argc, char *argv[]) *pte |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); - handle_flds_emulation_failure_exit(vcpu); - vcpu_run(vcpu); - ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + /* + * When TDP is enabled, KVM must emulate in response the guest physical + * address that is illegal from the guest's perspective, but is legal + * from hardware's perspeective. This should result in an emulation + * failure exit to userspace since KVM doesn't support emulating flds. + */ + if (kvm_is_tdp_enabled()) { + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unrecognized ucall: %lu\n", uc.cmd); + } kvm_vm_free(vm); -- GitLab From 3ae5b759c3c033b4ca2b7ef19836622902151517 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 2 Nov 2022 11:46:54 -0700 Subject: [PATCH 0707/1423] KVM: selftests: Add a test for KVM_CAP_EXIT_ON_EMULATION_FAILURE Add a selftest to exercise the KVM_CAP_EXIT_ON_EMULATION_FAILURE capability. This capability is also exercised through smaller_maxphyaddr_emulation_test, but that test requires allow_smaller_maxphyaddr=Y, which is off by default on Intel when ept=Y and unconditionally disabled on AMD when npt=Y. This new test ensures that KVM_CAP_EXIT_ON_EMULATION_FAILURE is exercised independent of allow_smaller_maxphyaddr. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20221102184654.282799-11-dmatlack@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../x86_64/exit_on_emulation_failure_test.c | 45 +++++++++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 053e5d34cd030..bef984e4c39d9 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -17,6 +17,7 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/exit_on_emulation_failure_test /x86_64/fix_hypercall_test /x86_64/get_msr_index_features /x86_64/kvm_clock_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index cff3a7ff87829..487248c67decf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -82,6 +82,7 @@ TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c new file mode 100644 index 0000000000000..37c61f712fd5c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + * + * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ + +#include "flds_emulation.h" + +#include "test_util.h" + +#define MMIO_GPA 0x700000000 +#define MMIO_GVA MMIO_GPA + +static void guest_code(void) +{ + /* Execute flds with an MMIO address to force KVM to emulate it. */ + flds(MMIO_GVA); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); + virt_map(vm, MMIO_GVA, MMIO_GPA, 1); + + vcpu_run(vcpu); + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + return 0; +} -- GitLab From 3bd396353d18b4f4e4f9953e5f5c46b0045a5477 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:14 +0000 Subject: [PATCH 0708/1423] KVM: selftests: Add X86_FEATURE_PAE and use it calc "fallback" MAXPHYADDR Add X86_FEATURE_PAE and use it to guesstimate the MAXPHYADDR when the MAXPHYADDR CPUID entry isn't supported. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-2-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 6093d0d53b4e6..4f928fe3ad179 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -89,6 +89,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) #define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) #define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) +#define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6) #define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) #define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) #define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4623874a805bc..061e001c8a48f 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1012,12 +1012,10 @@ bool is_amd_cpu(void) void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) { const struct kvm_cpuid_entry2 *entry; - bool pae; /* SDM 4.1.4 */ if (kvm_get_cpuid_max_extended() < 0x80000008) { - pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6); - *pa_bits = pae ? 36 : 32; + *pa_bits == kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32; *va_bits = 32; } else { entry = kvm_get_supported_cpuid_entry(0x80000008); -- GitLab From ee3795536664e514196cbe7396d3eb4c9925de98 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:15 +0000 Subject: [PATCH 0709/1423] KVM: selftests: Refactor X86_FEATURE_* framework to prep for X86_PROPERTY_* Refactor the X86_FEATURE_* framework to prepare for extending the core logic to support "properties". The "feature" framework allows querying a single CPUID bit to detect the presence of a feature; the "property" framework will extend the idea to allow querying a value, i.e. to get a value that is a set of contiguous bits in a CPUID leaf. Opportunistically add static asserts to ensure features are fully defined at compile time, and to try and catch mistakes in the definition of features. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-3-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 4f928fe3ad179..52e12d40e66c2 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -63,16 +63,21 @@ struct kvm_x86_cpu_feature { u8 reg; u8 bit; }; -#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ -({ \ - struct kvm_x86_cpu_feature feature = { \ - .function = fn, \ - .index = idx, \ - .reg = KVM_CPUID_##gpr, \ - .bit = __bit, \ - }; \ - \ - feature; \ +#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ +({ \ + struct kvm_x86_cpu_feature feature = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .bit = __bit, \ + }; \ + \ + static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ + feature; \ }) /* @@ -432,15 +437,22 @@ static inline void cpuid(uint32_t function, return __cpuid(function, 0, eax, ebx, ecx, edx); } -static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) +static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) { uint32_t gprs[4]; - __cpuid(feature.function, feature.index, + __cpuid(function, index, &gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX], &gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]); - return gprs[feature.reg] & BIT(feature.bit); + return (gprs[reg] & GENMASK(hi, lo)) >> lo; +} + +static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return __this_cpu_has(feature.function, feature.index, + feature.reg, feature.bit, feature.bit); } #define SET_XMM(__var, __xmm) \ -- GitLab From 53a7dc0f215ec91b098e3e6d7bb4bb9cef43a99a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:16 +0000 Subject: [PATCH 0710/1423] KVM: selftests: Add X86_PROPERTY_* framework to retrieve CPUID values Introduce X86_PROPERTY_* to allow retrieving values/properties from CPUID leafs, e.g. MAXPHYADDR from CPUID.0x80000008. Use the same core code as X86_FEATURE_*, the primary difference is that properties are multi-bit values, whereas features enumerate a single bit. Add this_cpu_has_p() to allow querying whether or not a property exists based on the maximum leaf associated with the property, e.g. MAXPHYADDR doesn't exist if the max leaf for 0x8000_xxxx is less than 0x8000_0008. Use the new property infrastructure in vm_compute_max_gfn() to prove that the code works as intended. Future patches will convert additional selftests code. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-4-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 68 +++++++++++++++++++ .../selftests/kvm/lib/x86_64/processor.c | 18 +++-- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 52e12d40e66c2..df1ba5af89ed3 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -168,6 +168,48 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) #define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) +/* + * Same idea as X86_FEATURE_XXX, but X86_PROPERTY_XXX retrieves a multi-bit + * value/property as opposed to a single-bit feature. Again, pack the info + * into a 64-bit value to pass by value with no overhead. + */ +struct kvm_x86_cpu_property { + u32 function; + u8 index; + u8 reg; + u8 lo_bit; + u8 hi_bit; +}; +#define KVM_X86_CPU_PROPERTY(fn, idx, gpr, low_bit, high_bit) \ +({ \ + struct kvm_x86_cpu_property property = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .lo_bit = low_bit, \ + .hi_bit = high_bit, \ + }; \ + \ + static_assert(low_bit < high_bit); \ + static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ + property; \ +}) + +#define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) + +#define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31) + +#define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) +#define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) +#define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) + +#define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) + + /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) #define PTE_WRITABLE_MASK BIT_ULL(1) @@ -455,6 +497,32 @@ static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) feature.reg, feature.bit, feature.bit); } +static inline uint32_t this_cpu_property(struct kvm_x86_cpu_property property) +{ + return __this_cpu_has(property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + +static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 061e001c8a48f..23321c1d0631e 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1229,7 +1229,8 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ unsigned long ht_gfn, max_gfn, max_pfn; - uint32_t eax, ebx, ecx, edx, max_ext_leaf; + uint32_t eax, ebx, ecx, edx; + uint8_t maxphyaddr; max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; @@ -1252,17 +1253,14 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use * the old conservative value if MAXPHYADDR is not enumerated. */ - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - max_ext_leaf = eax; - if (max_ext_leaf < 0x80000008) + if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) goto done; - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1; - if (max_ext_leaf >= 0x8000001f) { - cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); - max_pfn >>= (ebx >> 6) & 0x3f; - } + maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1; + + if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION)) + max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION); ht_gfn = max_pfn - num_ht_pages; done: -- GitLab From d80ddad2a8e042e6499d69d5a45a17051092e161 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:17 +0000 Subject: [PATCH 0711/1423] KVM: selftests: Use X86_PROPERTY_MAX_KVM_LEAF in CPUID test Use X86_PROPERTY_MAX_KVM_LEAF to replace the equivalent open coded check on KVM's maximum paravirt CPUID leaf. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-5-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index a6aeee2e62e4f..2fc3ad9c887e2 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -43,15 +43,6 @@ static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) } -static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid) -{ - u32 eax, ebx, ecx, edx; - - cpuid(0x40000000, &eax, &ebx, &ecx, &edx); - - GUEST_ASSERT(eax == 0x40000001); -} - static void guest_main(struct kvm_cpuid2 *guest_cpuid) { GUEST_SYNC(1); @@ -60,7 +51,7 @@ static void guest_main(struct kvm_cpuid2 *guest_cpuid) GUEST_SYNC(2); - test_cpuid_40000000(guest_cpuid); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF) == 0x40000001); GUEST_DONE(); } -- GitLab From a29e6e383b0d0d59a93ebbf6e93d3d41b905d336 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:18 +0000 Subject: [PATCH 0712/1423] KVM: selftests: Refactor kvm_cpuid_has() to prep for X86_PROPERTY_* support Refactor kvm_cpuid_has() to prepare for extending X86_PROPERTY_* support to KVM as well as "this CPU". No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-6-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 23321c1d0631e..710e5851a8635 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -652,8 +652,9 @@ const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; } -bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, - struct kvm_x86_cpu_feature feature) +static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) { const struct kvm_cpuid_entry2 *entry; int i; @@ -666,12 +667,18 @@ bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, * order, but kvm_x86_cpu_feature matches that mess, so yay * pointer shenanigans! */ - if (entry->function == feature.function && - entry->index == feature.index) - return (&entry->eax)[feature.reg] & BIT(feature.bit); + if (entry->function == function && entry->index == index) + return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo; } - return false; + return 0; +} + +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature) +{ + return __kvm_cpu_has(cpuid, feature.function, feature.index, + feature.reg, feature.bit, feature.bit); } uint64_t kvm_get_feature_msr(uint64_t msr_index) -- GitLab From 40854713e3254f7a4fc4a92388309140e51e046c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:19 +0000 Subject: [PATCH 0713/1423] KVM: selftests: Add kvm_cpu_*() support for X86_PROPERTY_* Extent X86_PROPERTY_* support to KVM, i.e. add kvm_cpu_property() and kvm_cpu_has_p(), and use the new helpers in kvm_get_cpu_address_width(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-7-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 34 ++++++++++++++++--- .../selftests/kvm/lib/x86_64/processor.c | 17 ++++++---- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index df1ba5af89ed3..1eb76268c9f73 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -205,6 +205,7 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) #define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) +#define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) #define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) @@ -703,6 +704,34 @@ static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); } +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property); + +static inline uint32_t kvm_cpu_property(struct kvm_x86_cpu_property property) +{ + return kvm_cpuid_property(kvm_get_supported_cpuid(), property); +} + +static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} + static inline size_t kvm_cpuid2_size(int nr_entries) { return sizeof(struct kvm_cpuid2) + @@ -815,11 +844,6 @@ static inline uint32_t kvm_get_cpuid_max_basic(void) return kvm_get_supported_cpuid_entry(0)->eax; } -static inline uint32_t kvm_get_cpuid_max_extended(void) -{ - return kvm_get_supported_cpuid_entry(0x80000000)->eax; -} - void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 710e5851a8635..af23c70ac129e 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -681,6 +681,13 @@ bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, feature.reg, feature.bit, feature.bit); } +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property) +{ + return __kvm_cpu_has(cpuid, property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + uint64_t kvm_get_feature_msr(uint64_t msr_index) { struct { @@ -1018,16 +1025,12 @@ bool is_amd_cpu(void) void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) { - const struct kvm_cpuid_entry2 *entry; - - /* SDM 4.1.4 */ - if (kvm_get_cpuid_max_extended() < 0x80000008) { + if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) { *pa_bits == kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32; *va_bits = 32; } else { - entry = kvm_get_supported_cpuid_entry(0x80000008); - *pa_bits = entry->eax & 0xff; - *va_bits = (entry->eax >> 8) & 0xff; + *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR); } } -- GitLab From 5dc19f1c7dd302f48a9f7fe7f29bb186d3477795 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:20 +0000 Subject: [PATCH 0714/1423] KVM: selftests: Convert AMX test to use X86_PROPRETY_XXX Add and use x86 "properties" for the myriad AMX CPUID values that are validated by the AMX test. Drop most of the test's single-usage helpers so that the asserts more precisely capture what check failed. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-8-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 9 ++ tools/testing/selftests/kvm/x86_64/amx_test.c | 101 ++++-------------- 2 files changed, 31 insertions(+), 79 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 1eb76268c9f73..841ef8c4ba003 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -200,6 +200,15 @@ struct kvm_x86_cpu_property { }) #define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) +#define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_TILE_OFFSET KVM_X86_CPU_PROPERTY(0xd, 18, EBX, 0, 31) +#define X86_PROPERTY_AMX_TOTAL_TILE_BYTES KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 0, 15) +#define X86_PROPERTY_AMX_BYTES_PER_TILE KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 16, 31) +#define X86_PROPERTY_AMX_BYTES_PER_ROW KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 0, 15) +#define X86_PROPERTY_AMX_NR_TILE_REGS KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 16, 31) +#define X86_PROPERTY_AMX_MAX_ROWS KVM_X86_CPU_PROPERTY(0x1d, 1, ECX, 0, 15) #define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index dadcbad10a1dc..21de6ae42086d 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -39,11 +39,6 @@ #define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) #define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -#define TILE_CPUID 0x1d -#define XSTATE_CPUID 0xd -#define TILE_PALETTE_CPUID_SUBLEAVE 0x1 -#define XSTATE_USER_STATE_SUBLEAVE 0x0 - #define XSAVE_HDR_OFFSET 512 struct xsave_data { @@ -129,71 +124,26 @@ static bool check_xsave_supports_xtile(void) return __xgetbv(0) & XFEATURE_MASK_XTILE; } -static bool enum_xtile_config(void) -{ - u32 eax, ebx, ecx, edx; - - __cpuid(TILE_CPUID, TILE_PALETTE_CPUID_SUBLEAVE, &eax, &ebx, &ecx, &edx); - if (!eax || !ebx || !ecx) - return false; - - xtile.max_names = ebx >> 16; - if (xtile.max_names < NUM_TILES) - return false; - - xtile.bytes_per_tile = eax >> 16; - if (xtile.bytes_per_tile < TILE_SIZE) - return false; - - xtile.bytes_per_row = ebx; - xtile.max_rows = ecx; - - return true; -} - -static bool enum_xsave_tile(void) -{ - u32 eax, ebx, ecx, edx; - - __cpuid(XSTATE_CPUID, XFEATURE_XTILEDATA, &eax, &ebx, &ecx, &edx); - if (!eax || !ebx) - return false; - - xtile.xsave_offset = ebx; - xtile.xsave_size = eax; - - return true; -} - -static bool check_xsave_size(void) -{ - u32 eax, ebx, ecx, edx; - bool valid = false; - - __cpuid(XSTATE_CPUID, XSTATE_USER_STATE_SUBLEAVE, &eax, &ebx, &ecx, &edx); - if (ebx && ebx <= XSAVE_SIZE) - valid = true; - - return valid; -} - -static bool check_xtile_info(void) +static void check_xtile_info(void) { - bool ret = false; - - if (!check_xsave_size()) - return ret; - - if (!enum_xsave_tile()) - return ret; - - if (!enum_xtile_config()) - return ret; + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0)); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE); - if (sizeof(struct tile_data) >= xtile.xsave_size) - ret = true; + xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET); + GUEST_ASSERT(xtile.xsave_offset == 2816); + xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE); + GUEST_ASSERT(xtile.xsave_size == 8192); + GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size); - return ret; + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS)); + xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS); + GUEST_ASSERT(xtile.max_names == 8); + xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE); + GUEST_ASSERT(xtile.bytes_per_tile == 1024); + xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW); + GUEST_ASSERT(xtile.bytes_per_row == 64); + xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS); + GUEST_ASSERT(xtile.max_rows == 16); } static void set_tilecfg(struct tile_config *cfg) @@ -238,16 +188,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, { init_regs(); check_cpuid_xsave(); - GUEST_ASSERT(check_xsave_supports_xtile()); - GUEST_ASSERT(check_xtile_info()); - - /* check xtile configs */ - GUEST_ASSERT(xtile.xsave_offset == 2816); - GUEST_ASSERT(xtile.xsave_size == 8192); - GUEST_ASSERT(xtile.max_names == 8); - GUEST_ASSERT(xtile.bytes_per_tile == 1024); - GUEST_ASSERT(xtile.bytes_per_row == 64); - GUEST_ASSERT(xtile.max_rows == 16); + check_xsave_supports_xtile(); + check_xtile_info(); GUEST_SYNC(1); /* xfd=0, enable amx */ @@ -317,8 +259,9 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); - /* Get xsave/restore max size */ - xsave_restore_size = kvm_get_supported_cpuid_entry(0xd)->ecx; + TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE), + "KVM should enumerate max XSAVE size when XSAVE is supported"); + xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE); run = vcpu->run; vcpu_regs_get(vcpu, ®s1); -- GitLab From 4feb9d21a407318129b6ea3a6735f1866439b9ab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:21 +0000 Subject: [PATCH 0715/1423] KVM: selftests: Convert vmx_pmu_caps_test to use X86_PROPERTY_* Add X86_PROPERTY_PMU_VERSION and use it in vmx_pmu_caps_test to replace open coded versions of the same functionality. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-9-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 6 ++---- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 19 ++----------------- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 841ef8c4ba003..848dfbc9866b1 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -200,6 +200,8 @@ struct kvm_x86_cpu_property { }) #define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) +#define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7) + #define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) #define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) #define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31) @@ -848,10 +850,6 @@ static inline void vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); } -static inline uint32_t kvm_get_cpuid_max_basic(void) -{ - return kvm_get_supported_cpuid_entry(0)->eax; -} void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 069589c52f419..c280ba1e65724 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -20,16 +20,6 @@ #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - union perf_capabilities { struct { u64 lbr_format:6; @@ -53,11 +43,9 @@ static void guest_code(void) int main(int argc, char *argv[]) { - const struct kvm_cpuid_entry2 *entry_a_0; struct kvm_vm *vm; struct kvm_vcpu *vcpu; int ret; - union cpuid10_eax eax; union perf_capabilities host_cap; uint64_t val; @@ -69,11 +57,8 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); - TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa); - entry_a_0 = kvm_get_supported_cpuid_entry(0xa); - - eax.full = entry_a_0->eax; - __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU"); + TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); /* testcase 1, set capabilities when we have PDCM bit */ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); -- GitLab From 5228c02a4c541a065ec071ea7ec2c1c76f3723dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:22 +0000 Subject: [PATCH 0716/1423] KVM: selftests: Add PMU feature framework, use in PMU event filter test Add an X86_PMU_FEATURE_* framework to simplify probing architectural events on Intel PMUs, which require checking the length of a bit vector and the _absence_ of a "feature" bit. Add helpers for both KVM and "this CPU", and use the newfangled magic (along with X86_PROPERTY_*) to clean up pmu_event_filter_test. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-10-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 41 +++++++++++++++ .../kvm/x86_64/pmu_event_filter_test.c | 51 +++---------------- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 848dfbc9866b1..343393308005c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -201,6 +201,8 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) #define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7) +#define X86_PROPERTY_PMU_NR_GP_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 8, 15) +#define X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 24, 31) #define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) #define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) @@ -221,6 +223,29 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) +/* + * Intel's architectural PMU events are bizarre. They have a "feature" bit + * that indicates the feature is _not_ supported, and a property that states + * the length of the bit mask of unsupported features. A feature is supported + * if the size of the bit mask is larger than the "unavailable" bit, and said + * bit is not set. + * + * Wrap the "unavailable" feature to simplify checking whether or not a given + * architectural event is supported. + */ +struct kvm_x86_pmu_feature { + struct kvm_x86_cpu_feature anti_feature; +}; +#define KVM_X86_PMU_FEATURE(name, __bit) \ +({ \ + struct kvm_x86_pmu_feature feature = { \ + .anti_feature = KVM_X86_CPU_FEATURE(0xa, 0, EBX, __bit), \ + }; \ + \ + feature; \ +}) + +#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(BRANCH_INSNS_RETIRED, 5) /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) @@ -535,6 +560,14 @@ static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property) return max_leaf >= property.function; } +static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) +{ + uint32_t nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + + return nr_bits > feature.anti_feature.bit && + !this_cpu_has(feature.anti_feature); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) @@ -743,6 +776,14 @@ static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property) return max_leaf >= property.function; } +static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) +{ + uint32_t nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + + return nr_bits > feature.anti_feature.bit && + !kvm_cpu_has(feature.anti_feature); +} + static inline size_t kvm_cpuid2_size(int nr_entries) { return sizeof(struct kvm_cpuid2) + diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index a6ffa245c8971..201c4ea44ca97 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -21,29 +21,6 @@ #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -union cpuid10_ebx { - struct { - unsigned int no_unhalted_core_cycles:1; - unsigned int no_instructions_retired:1; - unsigned int no_unhalted_reference_cycles:1; - unsigned int no_llc_reference:1; - unsigned int no_llc_misses:1; - unsigned int no_branch_instruction_retired:1; - unsigned int no_branch_misses_retired:1; - } split; - unsigned int full; -}; - /* End of stuff taken from perf_event.h. */ /* Oddly, this isn't in perf_event.h. */ @@ -380,30 +357,16 @@ static void test_pmu_config_disable(void (*guest_code)(void)) } /* - * Check for a non-zero PMU version, at least one general-purpose - * counter per logical processor, an EBX bit vector of length greater - * than 5, and EBX[5] clear. - */ -static bool check_intel_pmu_leaf(const struct kvm_cpuid_entry2 *entry) -{ - union cpuid10_eax eax = { .full = entry->eax }; - union cpuid10_ebx ebx = { .full = entry->ebx }; - - return eax.split.version_id && eax.split.num_counters > 0 && - eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED && - !ebx.split.no_branch_instruction_retired; -} - -/* - * Note that CPUID leaf 0xa is Intel-specific. This leaf should be - * clear on AMD hardware. + * On Intel, check for a non-zero PMU version, at least one general-purpose + * counter per logical processor, and support for counting the number of branch + * instructions retired. */ static bool use_intel_pmu(void) { - const struct kvm_cpuid_entry2 *entry; - - entry = kvm_get_supported_cpuid_entry(0xa); - return is_intel_cpu() && check_intel_pmu_leaf(entry); + return is_intel_cpu() && + kvm_cpu_property(X86_PROPERTY_PMU_VERSION) && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) && + kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED); } static bool is_zen1(uint32_t eax) -- GitLab From 24f3f9898e3c463b39dac8a03870c628dab8176e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:23 +0000 Subject: [PATCH 0717/1423] KVM: selftests: Add dedicated helpers for getting x86 Family and Model Add dedicated helpers for getting x86's Family and Model, which are the last holdouts that "need" raw access to CPUID information. FMS info is a mess and requires not only splicing together multiple values, but requires doing so conditional in the Family case. Provide wrappers to reduce the odds of copy+paste errors, but mostly to allow for the eventual removal of kvm_get_supported_cpuid_entry(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-11-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 52 +++++++++++++------ .../selftests/kvm/lib/x86_64/processor.c | 4 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 343393308005c..5d0acf8a96332 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -247,6 +247,23 @@ struct kvm_x86_pmu_feature { #define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(BRANCH_INSNS_RETIRED, 5) +static inline unsigned int x86_family(unsigned int eax) +{ + unsigned int x86; + + x86 = (eax >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +static inline unsigned int x86_model(unsigned int eax) +{ + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); +} + /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) #define PTE_WRITABLE_MASK BIT_ULL(1) @@ -516,6 +533,24 @@ static inline void cpuid(uint32_t function, return __cpuid(function, 0, eax, ebx, ecx, edx); } +static inline uint32_t this_cpu_fms(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline uint32_t this_cpu_family(void) +{ + return x86_family(this_cpu_fms()); +} + +static inline uint32_t this_cpu_model(void) +{ + return x86_model(this_cpu_fms()); +} + static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index, uint8_t reg, uint8_t lo, uint8_t hi) { @@ -658,23 +693,6 @@ static inline void cpu_relax(void) bool is_intel_cpu(void); bool is_amd_cpu(void); -static inline unsigned int x86_family(unsigned int eax) -{ - unsigned int x86; - - x86 = (eax >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; - - return x86; -} - -static inline unsigned int x86_model(unsigned int eax) -{ - return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); -} - struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu); void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state); void kvm_x86_state_cleanup(struct kvm_x86_state *state); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index af23c70ac129e..65e87b5acb6ff 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1239,7 +1239,6 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ unsigned long ht_gfn, max_gfn, max_pfn; - uint32_t eax, ebx, ecx, edx; uint8_t maxphyaddr; max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; @@ -1254,8 +1253,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) /* Before family 17h, the HyperTransport area is just below 1T. */ ht_gfn = (1 << 28) - num_ht_pages; - cpuid(1, &eax, &ebx, &ecx, &edx); - if (x86_family(eax) < 0x17) + if (this_cpu_family() < 0x17) goto done; /* -- GitLab From 074e9d4c9c6046f4605c9e37e90cff404119c525 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:24 +0000 Subject: [PATCH 0718/1423] KVM: selftests: Add and use KVM helpers for x86 Family and Model Add KVM variants of the x86 Family and Model helpers, and use them in the PMU event filter test. Open code the retrieval of KVM's supported CPUID entry 0x1.0 in anticipation of dropping kvm_get_supported_cpuid_entry(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-12-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 19 +++++++++++++-- .../kvm/x86_64/pmu_event_filter_test.c | 23 +++++++++---------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 5d0acf8a96332..4366dbcc1bcdc 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -754,10 +754,27 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); } +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); +static inline uint32_t kvm_cpu_fms(void) +{ + return get_cpuid_entry(kvm_get_supported_cpuid(), 0x1, 0)->eax; +} + +static inline uint32_t kvm_cpu_family(void) +{ + return x86_family(kvm_cpu_fms()); +} + +static inline uint32_t kvm_cpu_model(void) +{ + return x86_model(kvm_cpu_fms()); +} + bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, struct kvm_x86_cpu_feature feature); @@ -825,8 +842,6 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) return cpuid; } -const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index); void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 201c4ea44ca97..2de98fce7eddc 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -369,20 +369,19 @@ static bool use_intel_pmu(void) kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED); } -static bool is_zen1(uint32_t eax) +static bool is_zen1(uint32_t family, uint32_t model) { - return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f; + return family == 0x17 && model <= 0x0f; } -static bool is_zen2(uint32_t eax) +static bool is_zen2(uint32_t family, uint32_t model) { - return x86_family(eax) == 0x17 && - x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f; + return family == 0x17 && model >= 0x30 && model <= 0x3f; } -static bool is_zen3(uint32_t eax) +static bool is_zen3(uint32_t family, uint32_t model) { - return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f; + return family == 0x19 && model <= 0x0f; } /* @@ -395,13 +394,13 @@ static bool is_zen3(uint32_t eax) */ static bool use_amd_pmu(void) { - const struct kvm_cpuid_entry2 *entry; + uint32_t family = kvm_cpu_family(); + uint32_t model = kvm_cpu_model(); - entry = kvm_get_supported_cpuid_entry(1); return is_amd_cpu() && - (is_zen1(entry->eax) || - is_zen2(entry->eax) || - is_zen3(entry->eax)); + (is_zen1(family, model) || + is_zen2(family, model) || + is_zen3(family, model)); } int main(int argc, char *argv[]) -- GitLab From b941ba2380ccf51e048dc58ff0e6bdf11828f6d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:51:25 +0000 Subject: [PATCH 0719/1423] KVM: selftests: Drop helpers for getting specific KVM supported CPUID entry Drop kvm_get_supported_cpuid_entry() and its inner helper now that all known usage can use X86_FEATURE_*, X86_PROPERTY_*, X86_PMU_FEATURE_*, or the dedicated Family/Model helpers. Providing "raw" access to CPUID leafs is undesirable as it encourages open coding CPUID checks, which is often error prone and not self-documenting. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006005125.680782-13-seanjc@google.com --- .../testing/selftests/kvm/include/x86_64/processor.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 4366dbcc1bcdc..481f44c683aa1 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -902,17 +902,6 @@ static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); } -static inline const struct kvm_cpuid_entry2 *__kvm_get_supported_cpuid_entry(uint32_t function, - uint32_t index) -{ - return get_cpuid_entry(kvm_get_supported_cpuid(), function, index); -} - -static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_entry(uint32_t function) -{ - return __kvm_get_supported_cpuid_entry(function, 0); -} - uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value); -- GitLab From 5c107f7085f45e071bbcf13006fffccd8e5de0e1 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 16 Nov 2022 12:46:31 -0800 Subject: [PATCH 0720/1423] KVM: selftests: Assert in prepare_eptp() that nEPT is supported Now that a VM isn't needed to check for nEPT support, assert that KVM supports nEPT in prepare_eptp() instead of skipping the test, and push the TEST_REQUIRE() check out to individual tests. The require+assert are somewhat redundant and will incur some amount of ongoing maintenance burden, but placing the "require" logic in the test makes it easier to find/understand a test's requirements and in this case, provides a very strong hint that the test cares about nEPT. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20220927165209.930904-1-dmatlack@google.com [sean: rebase on merged code, write changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/memstress.c | 1 + tools/testing/selftests/kvm/lib/x86_64/vmx.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/memstress.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c index 2b3b47e4a973e..d61e623afc8cf 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86_64/memstress.c @@ -85,6 +85,7 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc int vcpu_id; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has_ept()); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vmx = vcpu_alloc_vmx(vm, &vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 6800fc9aeef01..3e4ea846366cb 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -559,7 +559,7 @@ bool kvm_cpu_has_ept(void) void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - TEST_REQUIRE(kvm_cpu_has_ept()); + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 2d8c23d639f77..f0456fb031b1e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -78,6 +78,7 @@ int main(int argc, char *argv[]) bool done = false; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has_ept()); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); -- GitLab From ecb89a51724b3cd89c13ba7364e82f9879b68dcf Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 16 Nov 2022 12:42:28 -0800 Subject: [PATCH 0721/1423] KVM: selftests: Check for KVM nEPT support using "feature" MSRs When checking for nEPT support in KVM, use kvm_get_feature_msr() instead of vcpu_get_msr() to retrieve KVM's default TRUE_PROCBASED_CTLS and PROCBASED_CTLS2 MSR values, i.e. don't require a VM+vCPU to query nEPT support. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20220927165209.930904-1-dmatlack@google.com [sean: rebase on merged code, write changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 71b290b6469d6..e9c96b49966a6 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -572,7 +572,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t memslot); void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); -bool kvm_vm_has_ept(struct kvm_vm *vm); +bool kvm_cpu_has_ept(void); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index d21049c38fc56..6800fc9aeef01 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -544,26 +544,22 @@ void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); } -bool kvm_vm_has_ept(struct kvm_vm *vm) +bool kvm_cpu_has_ept(void) { - struct kvm_vcpu *vcpu; uint64_t ctrl; - vcpu = list_first_entry(&vm->vcpus, struct kvm_vcpu, list); - TEST_ASSERT(vcpu, "Cannot determine EPT support without vCPUs.\n"); - - ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) return false; - ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; return ctrl & SECONDARY_EXEC_ENABLE_EPT; } void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - TEST_REQUIRE(kvm_vm_has_ept(vm)); + TEST_REQUIRE(kvm_cpu_has_ept()); vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); -- GitLab From 2653d53345bda90604f673bb211dd060a5a5c232 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: [PATCH 0722/1423] xfs: fix incorrect error-out in xfs_remove Clean up resources if resetting the dotdot entry doesn't succeed. Observed through code inspection. Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child") Signed-off-by: Darrick J. Wong Reviewed-by: Andrey Albershteyn --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index aa303be11576f..d354ea2b74f96 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2479,7 +2479,7 @@ xfs_remove( error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) - return error; + goto out_trans_cancel; } } else { /* -- GitLab From 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: [PATCH 0723/1423] xfs: fix sb write verify for lazysbcount When lazysbcount is enabled, fsstress and loop mount/unmount test report the following problems: XFS (loop0): SB summary counter sanity check failed XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460, xfs_sb block 0x0 XFS (loop0): Unmount and run xfs_repair XFS (loop0): First 128 bytes of corrupted metadata buffer: 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(.. 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z 00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... .......... 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ 00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................ 00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................ XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem. XFS (loop0): Please unmount the filesystem and rectify the problem(s) XFS (loop0): log mount/recovery failed: error -117 XFS (loop0): log mount failed This corruption will shutdown the file system and the file system will no longer be mountable. The following script can reproduce the problem, but it may take a long time. #!/bin/bash device=/dev/sda testdir=/mnt/test round=0 function fail() { echo "$*" exit 1 } mkdir -p $testdir while [ $round -lt 10000 ] do echo "******* round $round ********" mkfs.xfs -f $device mount $device $testdir || fail "mount failed!" fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null & sleep 4 killall -w fsstress umount $testdir xfs_repair -e $device > /dev/null if [ $? -eq 2 ];then echo "ERR CODE 2: Dirty log exception during repair." exit 1 fi round=$(($round+1)) done With lazysbcount is enabled, There is no additional lock protection for reading m_ifree and m_icount in xfs_log_sb(), if other cpu modifies the m_ifree, this will make the m_ifree greater than m_icount. For example, consider the following sequence and ifreedelta is postive: CPU0 CPU1 xfs_log_sb xfs_trans_unreserve_and_mod_sb ---------- ------------------------------ percpu_counter_sum(&mp->m_icount) percpu_counter_add_batch(&mp->m_icount, idelta, XFS_ICOUNT_BATCH) percpu_counter_add(&mp->m_ifree, ifreedelta); percpu_counter_sum(&mp->m_ifree) After this, incorrect inode count (sb_ifree > sb_icount) will be writen to the log. In the subsequent writing of sb, incorrect inode count (sb_ifree > sb_icount) will fail to pass the boundary check in xfs_validate_sb_write() that cause the file system shutdown. When lazysbcount is enabled, we don't need to guarantee that Lazy sb counters are completely correct, but we do need to guarantee that sb_ifree <= sb_icount. On the other hand, the constraint that m_ifree <= m_icount must be satisfied any time that there /cannot/ be other threads allocating or freeing inode chunks. If the constraint is violated under these circumstances, sb_i{count,free} (the ondisk superblock inode counters) maybe incorrect and need to be marked sick at unmount, the count will be rebuilt on the next mount. Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 4 +++- fs/xfs/xfs_mount.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a20cade590e9f..1eeecf2eb2a77 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -972,7 +972,9 @@ xfs_log_sb( */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_ifree = min_t(uint64_t, + percpu_counter_sum(&mp->m_ifree), + mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8bb3c2e847e1..fb87ffb48f7fe 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -538,6 +538,20 @@ xfs_check_summary_counts( return 0; } +static void +xfs_unmount_check( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return; + + if (percpu_counter_sum(&mp->m_ifree) > + percpu_counter_sum(&mp->m_icount)) { + xfs_alert(mp, "ifree/icount mismatch at unmount"); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); + } +} + /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, @@ -1077,6 +1091,7 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); + xfs_unmount_check(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); -- GitLab From 64c80dfd04d1dd2ecf550542c8f3f41b54b20207 Mon Sep 17 00:00:00 2001 From: Lukas Herbolt Date: Wed, 16 Nov 2022 19:20:21 -0800 Subject: [PATCH 0724/1423] xfs: Print XFS UUID on mount and umount events. As of now only device names are printed out over __xfs_printk(). The device names are not persistent across reboots which in case of searching for origin of corruption brings another task to properly identify the devices. This patch add XFS UUID upon every mount/umount event which will make the identification much easier. Signed-off-by: Lukas Herbolt [sandeen: rebase onto current upstream kernel] Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 ++++++---- fs/xfs/xfs_super.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f02a0dd522b3d..0141d9907d31b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -644,12 +644,14 @@ xfs_log_mount( int min_logfsbs; if (!xfs_has_norecovery(mp)) { - xfs_notice(mp, "Mounting V%d Filesystem", - XFS_SB_VERSION_NUM(&mp->m_sb)); + xfs_notice(mp, "Mounting V%d Filesystem %pU", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); } else { xfs_notice(mp, -"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", - XFS_SB_VERSION_NUM(&mp->m_sb)); +"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); ASSERT(xfs_is_readonly(mp)); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ee4b429a2f2c9..0c4b73e9b29d2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1110,7 +1110,7 @@ xfs_fs_put_super( if (!sb->s_fs_info) return; - xfs_notice(mp, "Unmounting Filesystem"); + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); -- GitLab From 4f44e519b6a945068755708119cca5b74d01d1f6 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:16:59 -0600 Subject: [PATCH 0725/1423] RDMA/irdma: Fix inline for multiple SGE's Currently, inline send and inline write assume a single SGE and only copy data from the first one. Add support for multiple SGE's. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-2-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/uk.c | 149 ++++++++++++++++++---------- drivers/infiniband/hw/irdma/user.h | 19 +--- drivers/infiniband/hw/irdma/verbs.c | 55 ++++------ 3 files changed, 119 insertions(+), 104 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index a6e5d350a94ce..1a57ed9d77ff3 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -566,21 +566,37 @@ static void irdma_set_mw_bind_wqe_gen_1(__le64 *wqe, /** * irdma_copy_inline_data_gen_1 - Copy inline data to wqe - * @dest: pointer to wqe - * @src: pointer to inline data - * @len: length of inline data to copy + * @wqe: pointer to wqe + * @sge_list: table of pointers to inline data + * @num_sges: Total inline data length * @polarity: compatibility parameter */ -static void irdma_copy_inline_data_gen_1(u8 *dest, u8 *src, u32 len, - u8 polarity) +static void irdma_copy_inline_data_gen_1(u8 *wqe, struct ib_sge *sge_list, + u32 num_sges, u8 polarity) { - if (len <= 16) { - memcpy(dest, src, len); - } else { - memcpy(dest, src, 16); - src += 16; - dest = dest + 32; - memcpy(dest, src, len - 16); + u32 quanta_bytes_remaining = 16; + int i; + + for (i = 0; i < num_sges; i++) { + u8 *cur_sge = (u8 *)(uintptr_t)sge_list[i].addr; + u32 sge_len = sge_list[i].length; + + while (sge_len) { + u32 bytes_copied; + + bytes_copied = min(sge_len, quanta_bytes_remaining); + memcpy(wqe, cur_sge, bytes_copied); + wqe += bytes_copied; + cur_sge += bytes_copied; + quanta_bytes_remaining -= bytes_copied; + sge_len -= bytes_copied; + + if (!quanta_bytes_remaining) { + /* Remaining inline bytes reside after hdr */ + wqe += 16; + quanta_bytes_remaining = 32; + } + } } } @@ -612,35 +628,51 @@ static void irdma_set_mw_bind_wqe(__le64 *wqe, /** * irdma_copy_inline_data - Copy inline data to wqe - * @dest: pointer to wqe - * @src: pointer to inline data - * @len: length of inline data to copy + * @wqe: pointer to wqe + * @sge_list: table of pointers to inline data + * @num_sges: number of SGE's * @polarity: polarity of wqe valid bit */ -static void irdma_copy_inline_data(u8 *dest, u8 *src, u32 len, u8 polarity) +static void irdma_copy_inline_data(u8 *wqe, struct ib_sge *sge_list, + u32 num_sges, u8 polarity) { u8 inline_valid = polarity << IRDMA_INLINE_VALID_S; - u32 copy_size; - - dest += 8; - if (len <= 8) { - memcpy(dest, src, len); - return; - } - - *((u64 *)dest) = *((u64 *)src); - len -= 8; - src += 8; - dest += 24; /* point to additional 32 byte quanta */ - - while (len) { - copy_size = len < 31 ? len : 31; - memcpy(dest, src, copy_size); - *(dest + 31) = inline_valid; - len -= copy_size; - dest += 32; - src += copy_size; + u32 quanta_bytes_remaining = 8; + bool first_quanta = true; + int i; + + wqe += 8; + + for (i = 0; i < num_sges; i++) { + u8 *cur_sge = (u8 *)(uintptr_t)sge_list[i].addr; + u32 sge_len = sge_list[i].length; + + while (sge_len) { + u32 bytes_copied; + + bytes_copied = min(sge_len, quanta_bytes_remaining); + memcpy(wqe, cur_sge, bytes_copied); + wqe += bytes_copied; + cur_sge += bytes_copied; + quanta_bytes_remaining -= bytes_copied; + sge_len -= bytes_copied; + + if (!quanta_bytes_remaining) { + quanta_bytes_remaining = 31; + + /* Remaining inline bytes reside after hdr */ + if (first_quanta) { + first_quanta = false; + wqe += 16; + } else { + *wqe = inline_valid; + wqe++; + } + } + } } + if (!first_quanta && quanta_bytes_remaining < 31) + *(wqe + quanta_bytes_remaining) = inline_valid; } /** @@ -679,20 +711,27 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; - struct irdma_inline_rdma_write *op_info; + struct irdma_rdma_write *op_info; u64 hdr = 0; u32 wqe_idx; bool read_fence = false; + u32 i, total_size = 0; u16 quanta; info->push_wqe = qp->push_db ? true : false; - op_info = &info->op.inline_rdma_write; + op_info = &info->op.rdma_write; + + if (unlikely(qp->max_sq_frag_cnt < op_info->num_lo_sges)) + return -EINVAL; - if (op_info->len > qp->max_inline_data) + for (i = 0; i < op_info->num_lo_sges; i++) + total_size += op_info->lo_sg_list[i].length; + + if (unlikely(total_size > qp->max_inline_data)) return -EINVAL; - quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); - wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, + quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(total_size); + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) return -ENOMEM; @@ -705,7 +744,7 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, hdr = FIELD_PREP(IRDMAQPSQ_REMSTAG, op_info->rem_addr.lkey) | FIELD_PREP(IRDMAQPSQ_OPCODE, info->op_type) | - FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, op_info->len) | + FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, total_size) | FIELD_PREP(IRDMAQPSQ_REPORTRTT, info->report_rtt ? 1 : 0) | FIELD_PREP(IRDMAQPSQ_INLINEDATAFLAG, 1) | FIELD_PREP(IRDMAQPSQ_IMMDATAFLAG, info->imm_data_valid ? 1 : 0) | @@ -719,7 +758,8 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, set_64bit_val(wqe, 0, FIELD_PREP(IRDMAQPSQ_IMMDATA, info->imm_data)); - qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len, + qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->lo_sg_list, + op_info->num_lo_sges, qp->swqe_polarity); dma_wmb(); /* make sure WQE is populated before valid bit is set */ @@ -745,20 +785,27 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; - struct irdma_post_inline_send *op_info; + struct irdma_post_send *op_info; u64 hdr; u32 wqe_idx; bool read_fence = false; + u32 i, total_size = 0; u16 quanta; info->push_wqe = qp->push_db ? true : false; - op_info = &info->op.inline_send; + op_info = &info->op.send; - if (op_info->len > qp->max_inline_data) + if (unlikely(qp->max_sq_frag_cnt < op_info->num_sges)) return -EINVAL; - quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); - wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, + for (i = 0; i < op_info->num_sges; i++) + total_size += op_info->sg_list[i].length; + + if (unlikely(total_size > qp->max_inline_data)) + return -EINVAL; + + quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(total_size); + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) return -ENOMEM; @@ -773,7 +820,7 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, hdr = FIELD_PREP(IRDMAQPSQ_REMSTAG, info->stag_to_inv) | FIELD_PREP(IRDMAQPSQ_AHID, op_info->ah_id) | FIELD_PREP(IRDMAQPSQ_OPCODE, info->op_type) | - FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, op_info->len) | + FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, total_size) | FIELD_PREP(IRDMAQPSQ_IMMDATAFLAG, (info->imm_data_valid ? 1 : 0)) | FIELD_PREP(IRDMAQPSQ_REPORTRTT, (info->report_rtt ? 1 : 0)) | @@ -789,8 +836,8 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, if (info->imm_data_valid) set_64bit_val(wqe, 0, FIELD_PREP(IRDMAQPSQ_IMMDATA, info->imm_data)); - qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len, - qp->swqe_polarity); + qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->sg_list, + op_info->num_sges, qp->swqe_polarity); dma_wmb(); /* make sure WQE is populated before valid bit is set */ diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 2ef61923c9268..424d4aa8cdcd5 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -173,14 +173,6 @@ struct irdma_post_send { u32 ah_id; }; -struct irdma_post_inline_send { - void *data; - u32 len; - u32 qkey; - u32 dest_qp; - u32 ah_id; -}; - struct irdma_post_rq_info { u64 wr_id; struct ib_sge *sg_list; @@ -193,12 +185,6 @@ struct irdma_rdma_write { struct ib_sge rem_addr; }; -struct irdma_inline_rdma_write { - void *data; - u32 len; - struct ib_sge rem_addr; -}; - struct irdma_rdma_read { struct ib_sge *lo_sg_list; u32 num_lo_sges; @@ -241,8 +227,6 @@ struct irdma_post_sq_info { struct irdma_rdma_read rdma_read; struct irdma_bind_window bind_window; struct irdma_inv_local_stag inv_local_stag; - struct irdma_inline_rdma_write inline_rdma_write; - struct irdma_post_inline_send inline_send; } op; }; @@ -291,7 +275,8 @@ int irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, bool post_sq); struct irdma_wqe_uk_ops { - void (*iw_copy_inline_data)(u8 *dest, u8 *src, u32 len, u8 polarity); + void (*iw_copy_inline_data)(u8 *dest, struct ib_sge *sge_list, + u32 num_sges, u8 polarity); u16 (*iw_inline_data_size_to_quanta)(u32 data_size); void (*iw_set_fragment)(__le64 *wqe, u32 offset, struct ib_sge *sge, u8 valid); diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 434241789f123..e252f431e2aca 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3136,30 +3136,20 @@ static int irdma_post_send(struct ib_qp *ibqp, info.stag_to_inv = ib_wr->ex.invalidate_rkey; } - if (ib_wr->send_flags & IB_SEND_INLINE) { - info.op.inline_send.data = (void *)(unsigned long) - ib_wr->sg_list[0].addr; - info.op.inline_send.len = ib_wr->sg_list[0].length; - if (iwqp->ibqp.qp_type == IB_QPT_UD || - iwqp->ibqp.qp_type == IB_QPT_GSI) { - ah = to_iwah(ud_wr(ib_wr)->ah); - info.op.inline_send.ah_id = ah->sc_ah.ah_info.ah_idx; - info.op.inline_send.qkey = ud_wr(ib_wr)->remote_qkey; - info.op.inline_send.dest_qp = ud_wr(ib_wr)->remote_qpn; - } + info.op.send.num_sges = ib_wr->num_sge; + info.op.send.sg_list = ib_wr->sg_list; + if (iwqp->ibqp.qp_type == IB_QPT_UD || + iwqp->ibqp.qp_type == IB_QPT_GSI) { + ah = to_iwah(ud_wr(ib_wr)->ah); + info.op.send.ah_id = ah->sc_ah.ah_info.ah_idx; + info.op.send.qkey = ud_wr(ib_wr)->remote_qkey; + info.op.send.dest_qp = ud_wr(ib_wr)->remote_qpn; + } + + if (ib_wr->send_flags & IB_SEND_INLINE) err = irdma_uk_inline_send(ukqp, &info, false); - } else { - info.op.send.num_sges = ib_wr->num_sge; - info.op.send.sg_list = ib_wr->sg_list; - if (iwqp->ibqp.qp_type == IB_QPT_UD || - iwqp->ibqp.qp_type == IB_QPT_GSI) { - ah = to_iwah(ud_wr(ib_wr)->ah); - info.op.send.ah_id = ah->sc_ah.ah_info.ah_idx; - info.op.send.qkey = ud_wr(ib_wr)->remote_qkey; - info.op.send.dest_qp = ud_wr(ib_wr)->remote_qpn; - } + else err = irdma_uk_send(ukqp, &info, false); - } break; case IB_WR_RDMA_WRITE_WITH_IMM: if (ukqp->qp_caps & IRDMA_WRITE_WITH_IMM) { @@ -3176,22 +3166,15 @@ static int irdma_post_send(struct ib_qp *ibqp, else info.op_type = IRDMA_OP_TYPE_RDMA_WRITE; - if (ib_wr->send_flags & IB_SEND_INLINE) { - info.op.inline_rdma_write.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; - info.op.inline_rdma_write.len = - ib_wr->sg_list[0].length; - info.op.inline_rdma_write.rem_addr.addr = - rdma_wr(ib_wr)->remote_addr; - info.op.inline_rdma_write.rem_addr.lkey = - rdma_wr(ib_wr)->rkey; + info.op.rdma_write.num_lo_sges = ib_wr->num_sge; + info.op.rdma_write.lo_sg_list = ib_wr->sg_list; + info.op.rdma_write.rem_addr.addr = + rdma_wr(ib_wr)->remote_addr; + info.op.rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; + if (ib_wr->send_flags & IB_SEND_INLINE) err = irdma_uk_inline_rdma_write(ukqp, &info, false); - } else { - info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; - info.op.rdma_write.num_lo_sges = ib_wr->num_sge; - info.op.rdma_write.rem_addr.addr = rdma_wr(ib_wr)->remote_addr; - info.op.rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; + else err = irdma_uk_rdma_write(ukqp, &info, false); - } break; case IB_WR_RDMA_READ_WITH_INV: inv_stag = true; -- GitLab From 24419777e9431137d5923a747f546facb1e49b1f Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:17:00 -0600 Subject: [PATCH 0726/1423] RDMA/irdma: Fix RQ completion opcode The opcode written by HW, in the RQ CQE, is the RoCEv2/iWARP protocol opcode from the received packet and not the SW opcode as currently assumed. Fix this by returning the raw operation type and queue type in the CQE to irdma_process_cqe and add 2 helpers set_ib_wc_op_sq set_ib_wc_op_rq to map IRDMA HW op types to IB op types. Note that for iWARP, only Write with Immediate is supported so the opcode can only be IB_WC_RECV_RDMA_WITH_IMM when there is immediate data present. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-3-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/uk.c | 21 +++++------- drivers/infiniband/hw/irdma/user.h | 1 + drivers/infiniband/hw/irdma/utils.c | 2 ++ drivers/infiniband/hw/irdma/verbs.c | 39 ++++----------------- drivers/infiniband/hw/irdma/verbs.h | 53 +++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 1a57ed9d77ff3..16183e894da77 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1049,11 +1049,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, __le64 *cqe; struct irdma_qp_uk *qp; struct irdma_ring *pring = NULL; - u32 wqe_idx, q_type; + u32 wqe_idx; int ret_code; bool move_cq_head = true; u8 polarity; - u8 op_type; bool ext_valid; __le64 *ext_cqe; @@ -1121,7 +1120,7 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->ud_vlan_valid = false; } - q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); + info->q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); info->error = (bool)FIELD_GET(IRDMA_CQ_ERROR, qword3); info->push_dropped = (bool)FIELD_GET(IRDMACQ_PSHDROP, qword3); info->ipv4 = (bool)FIELD_GET(IRDMACQ_IPV4, qword3); @@ -1160,8 +1159,9 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } wqe_idx = (u32)FIELD_GET(IRDMA_CQ_WQEIDX, qword3); info->qp_handle = (irdma_qp_handle)(unsigned long)qp; + info->op_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); - if (q_type == IRDMA_CQE_QTYPE_RQ) { + if (info->q_type == IRDMA_CQE_QTYPE_RQ) { u32 array_idx; array_idx = wqe_idx / qp->rq_wqe_size_multiplier; @@ -1181,10 +1181,6 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->bytes_xfered = (u32)FIELD_GET(IRDMACQ_PAYLDLEN, qword0); - if (info->imm_valid) - info->op_type = IRDMA_OP_TYPE_REC_IMM; - else - info->op_type = IRDMA_OP_TYPE_REC; if (qword3 & IRDMACQ_STAG) { info->stag_invalid_set = true; info->inv_stag = (u32)FIELD_GET(IRDMACQ_INVSTAG, qword2); @@ -1242,17 +1238,18 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, sw_wqe = qp->sq_base[tail].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); - op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, wqe_qword); - info->op_type = op_type; + info->op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, + wqe_qword); IRDMA_RING_SET_TAIL(qp->sq_ring, tail + qp->sq_wrtrk_array[tail].quanta); - if (op_type != IRDMAQP_OP_NOP) { + if (info->op_type != IRDMAQP_OP_NOP) { info->wr_id = qp->sq_wrtrk_array[tail].wrid; info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len; break; } } while (1); - if (op_type == IRDMA_OP_TYPE_BIND_MW && info->minor_err == FLUSH_PROT_ERR) + if (info->op_type == IRDMA_OP_TYPE_BIND_MW && + info->minor_err == FLUSH_PROT_ERR) info->minor_err = FLUSH_MW_BIND_ERR; qp->sq_flush_seen = true; if (!IRDMA_RING_MORE_WORK(qp->sq_ring)) diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 424d4aa8cdcd5..d0cdf609f5e06 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -245,6 +245,7 @@ struct irdma_cq_poll_info { u16 ud_vlan; u8 ud_smac[6]; u8 op_type; + u8 q_type; bool stag_invalid_set:1; /* or L_R_Key set */ bool push_dropped:1; bool error:1; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8dfc9e154d733..445e69e864097 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2591,6 +2591,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) sw_wqe = qp->sq_base[wqe_idx].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); cmpl->cpi.op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, IRDMAQPSQ_OPCODE); + cmpl->cpi.q_type = IRDMA_CQE_QTYPE_SQ; /* remove the SQ WR by moving SQ tail*/ IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta); @@ -2629,6 +2630,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx]; cmpl->cpi.op_type = IRDMA_OP_TYPE_REC; + cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ; /* remove the RQ WR by moving RQ tail */ IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); ibdev_dbg(iwqp->iwrcq->ibcq.device, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index e252f431e2aca..01d0dc4b56490 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3334,7 +3334,6 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode static void irdma_process_cqe(struct ib_wc *entry, struct irdma_cq_poll_info *cq_poll_info) { - struct irdma_qp *iwqp; struct irdma_sc_qp *qp; entry->wc_flags = 0; @@ -3342,7 +3341,6 @@ static void irdma_process_cqe(struct ib_wc *entry, entry->wr_id = cq_poll_info->wr_id; qp = cq_poll_info->qp_handle; - iwqp = qp->qp_uk.back_qp; entry->qp = qp->qp_uk.back_qp; if (cq_poll_info->error) { @@ -3375,42 +3373,17 @@ static void irdma_process_cqe(struct ib_wc *entry, } } - switch (cq_poll_info->op_type) { - case IRDMA_OP_TYPE_RDMA_WRITE: - case IRDMA_OP_TYPE_RDMA_WRITE_SOL: - entry->opcode = IB_WC_RDMA_WRITE; - break; - case IRDMA_OP_TYPE_RDMA_READ_INV_STAG: - case IRDMA_OP_TYPE_RDMA_READ: - entry->opcode = IB_WC_RDMA_READ; - break; - case IRDMA_OP_TYPE_SEND_INV: - case IRDMA_OP_TYPE_SEND_SOL: - case IRDMA_OP_TYPE_SEND_SOL_INV: - case IRDMA_OP_TYPE_SEND: - entry->opcode = IB_WC_SEND; - break; - case IRDMA_OP_TYPE_FAST_REG_NSMR: - entry->opcode = IB_WC_REG_MR; - break; - case IRDMA_OP_TYPE_INV_STAG: - entry->opcode = IB_WC_LOCAL_INV; - break; - case IRDMA_OP_TYPE_REC_IMM: - case IRDMA_OP_TYPE_REC: - entry->opcode = cq_poll_info->op_type == IRDMA_OP_TYPE_REC_IMM ? - IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + if (cq_poll_info->q_type == IRDMA_CQE_QTYPE_SQ) { + set_ib_wc_op_sq(cq_poll_info, entry); + } else { + set_ib_wc_op_rq(cq_poll_info, entry, + qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM ? + true : false); if (qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_UD && cq_poll_info->stag_invalid_set) { entry->ex.invalidate_rkey = cq_poll_info->inv_stag; entry->wc_flags |= IB_WC_WITH_INVALIDATE; } - break; - default: - ibdev_err(&iwqp->iwdev->ibdev, - "Invalid opcode = %d in CQE\n", cq_poll_info->op_type); - entry->status = IB_WC_GENERAL_ERR; - return; } if (qp->qp_uk.qp_type == IRDMA_QP_TYPE_ROCE_UD) { diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index 4309b7159f42c..a536e9fa85ebf 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -232,6 +232,59 @@ static inline u16 irdma_fw_minor_ver(struct irdma_sc_dev *dev) return (u16)FIELD_GET(IRDMA_FW_VER_MINOR, dev->feature_info[IRDMA_FEATURE_FW_INFO]); } +static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, + struct ib_wc *entry) +{ + switch (cq_poll_info->op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case IRDMA_OP_TYPE_RDMA_READ_INV_STAG: + case IRDMA_OP_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case IRDMA_OP_TYPE_SEND_SOL: + case IRDMA_OP_TYPE_SEND_SOL_INV: + case IRDMA_OP_TYPE_SEND_INV: + case IRDMA_OP_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case IRDMA_OP_TYPE_FAST_REG_NSMR: + entry->opcode = IB_WC_REG_MR; + break; + case IRDMA_OP_TYPE_INV_STAG: + entry->opcode = IB_WC_LOCAL_INV; + break; + default: + entry->status = IB_WC_GENERAL_ERR; + } +} + +static inline void set_ib_wc_op_rq(struct irdma_cq_poll_info *cq_poll_info, + struct ib_wc *entry, bool send_imm_support) +{ + /** + * iWARP does not support sendImm, so the presence of Imm data + * must be WriteImm. + */ + if (!send_imm_support) { + entry->opcode = cq_poll_info->imm_valid ? + IB_WC_RECV_RDMA_WITH_IMM : + IB_WC_RECV; + return; + } + + switch (cq_poll_info->op_type) { + case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->opcode = IB_WC_RECV; + } +} + void irdma_mcast_mac(u32 *ip_addr, u8 *mac, bool ipv4); int irdma_ib_register_device(struct irdma_device *iwdev); void irdma_ib_unregister_device(struct irdma_device *iwdev); -- GitLab From 8f7e2daa6336f9f4b6f8a4715a809674606df16b Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:17:01 -0600 Subject: [PATCH 0727/1423] RDMA/irdma: Do not request 2-level PBLEs for CQ alloc When allocating PBLE's for a large CQ, it is possible that a 2-level PBLE is returned which would cause the CQ allocation to fail since 1-level is assumed and checked for. Fix this by requesting a level one PBLE only. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-4-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 01d0dc4b56490..dc3f5f3fee901 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2329,9 +2329,10 @@ static bool irdma_check_mr_contiguous(struct irdma_pble_alloc *palloc, * @rf: RDMA PCI function * @iwmr: mr pointer for this memory registration * @use_pbles: flag if to use pble's + * @lvl_1_only: request only level 1 pble if true */ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, - bool use_pbles) + bool use_pbles, bool lvl_1_only) { struct irdma_pbl *iwpbl = &iwmr->iwpbl; struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; @@ -2342,7 +2343,7 @@ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, if (use_pbles) { status = irdma_get_pble(rf->pble_rsrc, palloc, iwmr->page_cnt, - false); + lvl_1_only); if (status) return status; @@ -2385,16 +2386,10 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, bool ret = true; pg_size = iwmr->page_size; - err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles); + err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, true); if (err) return err; - if (use_pbles && palloc->level != PBLE_LEVEL_1) { - irdma_free_pble(iwdev->rf->pble_rsrc, palloc); - iwpbl->pbl_allocated = false; - return -ENOMEM; - } - if (use_pbles) arr = palloc->level1.addr; @@ -2870,7 +2865,7 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, case IRDMA_MEMREG_TYPE_MEM: use_pbles = (iwmr->page_cnt != 1); - err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles); + err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false); if (err) goto error; -- GitLab From d7115727e32e94c77a5441892e74ae0339afa1a5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:38 +0800 Subject: [PATCH 0728/1423] RDMA/rtrs-srv: Refactor rtrs_srv_rdma_cm_handler The RDMA_CM_EVENT_CONNECT_REQUEST is quite different to other types, let's check it separately at the beginning of routine, then we can avoid the indentation accordingly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-2-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 22d7ba05e9fe8..5fe3699cb8ff9 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1950,22 +1950,21 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, { struct rtrs_srv_path *srv_path = NULL; struct rtrs_path *s = NULL; + struct rtrs_con *c = NULL; - if (ev->event != RDMA_CM_EVENT_CONNECT_REQUEST) { - struct rtrs_con *c = cm_id->context; - - s = c->path; - srv_path = to_srv_path(s); - } - - switch (ev->event) { - case RDMA_CM_EVENT_CONNECT_REQUEST: + if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST) /* * In case of error cma.c will destroy cm_id, * see cma_process_remove() */ return rtrs_rdma_connect(cm_id, ev->param.conn.private_data, ev->param.conn.private_data_len); + + c = cm_id->context; + s = c->path; + srv_path = to_srv_path(s); + + switch (ev->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Nothing here */ break; -- GitLab From 0f597ac618d04beb9de997fda59a29c9d3818fb2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:39 +0800 Subject: [PATCH 0729/1423] RDMA/rtrs-srv: Refactor the handling of failure case in map_cont_bufs Let's call unmap_cont_bufs when failure happens, and also only update mrs_num after everything is settled which means we can remove 'mri'. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-3-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 47 +++++++++++--------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 5fe3699cb8ff9..b877dd57b6b92 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -561,9 +561,11 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) { struct rtrs_srv_sess *srv = srv_path->srv; struct rtrs_path *ss = &srv_path->s; - int i, mri, err, mrs_num; + int i, err, mrs_num; unsigned int chunk_bits; int chunks_per_mr = 1; + struct ib_mr *mr; + struct sg_table *sgt; /* * Here we map queue_depth chunks to MR. Firstly we have to @@ -586,16 +588,14 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) if (!srv_path->mrs) return -ENOMEM; - srv_path->mrs_num = mrs_num; - - for (mri = 0; mri < mrs_num; mri++) { - struct rtrs_srv_mr *srv_mr = &srv_path->mrs[mri]; - struct sg_table *sgt = &srv_mr->sgt; + for (srv_path->mrs_num = 0; srv_path->mrs_num < mrs_num; + srv_path->mrs_num++) { + struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num]; struct scatterlist *s; - struct ib_mr *mr; int nr, nr_sgt, chunks; - chunks = chunks_per_mr * mri; + sgt = &srv_mr->sgt; + chunks = chunks_per_mr * srv_path->mrs_num; if (!always_invalidate) chunks_per_mr = min_t(int, chunks_per_mr, srv->queue_depth - chunks); @@ -644,31 +644,24 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); srv_mr->mr = mr; - - continue; -err: - while (mri--) { - srv_mr = &srv_path->mrs[mri]; - sgt = &srv_mr->sgt; - mr = srv_mr->mr; - rtrs_iu_free(srv_mr->iu, srv_path->s.dev->ib_dev, 1); -dereg_mr: - ib_dereg_mr(mr); -unmap_sg: - ib_dma_unmap_sg(srv_path->s.dev->ib_dev, sgt->sgl, - sgt->nents, DMA_BIDIRECTIONAL); -free_sg: - sg_free_table(sgt); - } - kfree(srv_path->mrs); - - return err; } chunk_bits = ilog2(srv->queue_depth - 1) + 1; srv_path->mem_bits = (MAX_IMM_PAYL_BITS - chunk_bits); return 0; + +dereg_mr: + ib_dereg_mr(mr); +unmap_sg: + ib_dma_unmap_sg(srv_path->s.dev->ib_dev, sgt->sgl, + sgt->nents, DMA_BIDIRECTIONAL); +free_sg: + sg_free_table(sgt); +err: + unmap_cont_bufs(srv_path); + + return err; } static void rtrs_srv_hb_err_handler(struct rtrs_con *c) -- GitLab From 102d2f70ec0999a5cde181f1ccbe8a81cba45b10 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:40 +0800 Subject: [PATCH 0730/1423] RDMA/rtrs-srv: Correct the checking of ib_map_mr_sg We should check with nr_sgt, also the only successful case is that all sg elements are mapped, so make it explicitly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-4-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index b877dd57b6b92..581c850e71d64 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -622,7 +622,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) } nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, NULL, max_chunk_size); - if (nr < 0 || nr < sgt->nents) { + if (nr != nr_sgt) { err = nr < 0 ? nr : -EINVAL; goto dereg_mr; } -- GitLab From f5708e6699c230f64736107c90b63a53bdc0a613 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:41 +0800 Subject: [PATCH 0731/1423] RDMA/rtrs-clt: Correct the checking of ib_map_mr_sg We should check with count, also the only successful case is that all sg elements are mapped, so make it explicitly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-5-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 8546b8816524c..be7c8480f9472 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1064,10 +1064,8 @@ static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count) /* Align the MR to a 4K page size to match the block virt boundary */ nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K); - if (nr < 0) - return nr; - if (nr < req->sg_cnt) - return -EINVAL; + if (nr != count) + return nr < 0 ? nr : -EINVAL; ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); return nr; -- GitLab From a4399563356c86eafeadcdc155d5d93320713b6e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:42 +0800 Subject: [PATCH 0732/1423] RDMA/rtrs-srv: Remove outdated comments from create_con Remove the orphan comments. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-6-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 581c850e71d64..d1703e2c0b82f 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1671,12 +1671,6 @@ static int create_con(struct rtrs_srv_path *srv_path, srv->queue_depth * (1 + 2) + 1); max_recv_wr = srv->queue_depth + 1; - /* - * If we have all receive requests posted and - * all write requests posted and each read request - * requires an invalidate request + drain - * and qp gets into error state. - */ } cq_num = max_send_wr + max_recv_wr; atomic_set(&con->c.sq_wr_avail, max_send_wr); -- GitLab From 7526198f27107278c600a3aee41f1a77ed84dd78 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:43 +0800 Subject: [PATCH 0733/1423] RDMA/rtrs: Clean up rtrs_rdma_dev_pd_ops Let's remove them since the three members are not used. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-7-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-pri.h | 3 --- drivers/infiniband/ulp/rtrs/rtrs.c | 22 ++++------------------ 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index a2420eecaf5a1..ab25619261d28 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -68,10 +68,7 @@ enum { struct rtrs_ib_dev; struct rtrs_rdma_dev_pd_ops { - struct rtrs_ib_dev *(*alloc)(void); - void (*free)(struct rtrs_ib_dev *dev); int (*init)(struct rtrs_ib_dev *dev); - void (*deinit)(struct rtrs_ib_dev *dev); }; struct rtrs_rdma_dev_pd { diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index ed324b47d93ae..4bf9d868cc522 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -557,7 +557,6 @@ EXPORT_SYMBOL(rtrs_addr_to_sockaddr); void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags, struct rtrs_rdma_dev_pd *pool) { - WARN_ON(pool->ops && (!pool->ops->alloc ^ !pool->ops->free)); INIT_LIST_HEAD(&pool->list); mutex_init(&pool->mutex); pool->pd_flags = pd_flags; @@ -583,15 +582,8 @@ static void dev_free(struct kref *ref) list_del(&dev->entry); mutex_unlock(&pool->mutex); - if (pool->ops && pool->ops->deinit) - pool->ops->deinit(dev); - ib_dealloc_pd(dev->ib_pd); - - if (pool->ops && pool->ops->free) - pool->ops->free(dev); - else - kfree(dev); + kfree(dev); } int rtrs_ib_dev_put(struct rtrs_ib_dev *dev) @@ -618,11 +610,8 @@ rtrs_ib_dev_find_or_add(struct ib_device *ib_dev, goto out_unlock; } mutex_unlock(&pool->mutex); - if (pool->ops && pool->ops->alloc) - dev = pool->ops->alloc(); - else - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (IS_ERR_OR_NULL(dev)) + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) goto out_err; kref_init(&dev->ref); @@ -644,10 +633,7 @@ out_unlock: out_free_pd: ib_dealloc_pd(dev->ib_pd); out_free_dev: - if (pool->ops && pool->ops->free) - pool->ops->free(dev); - else - kfree(dev); + kfree(dev); out_err: return NULL; } -- GitLab From 6af4609c18b3aa69209d022b9c00e6db78c57ae5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:44 +0800 Subject: [PATCH 0734/1423] RDMA/rtrs-srv: Fix several issues in rtrs_srv_destroy_path_files There are several issues in the function which is supposed to be paired with rtrs_srv_create_path_files. 1. rtrs_srv_stats_attr_group is not removed though it is created in rtrs_srv_create_stats_files. 2. it makes more sense to check kobj_stats.state_in_sysfs before destroy kobj_stats instead of rely on kobj.state_in_sysfs. 3. kobject_init_and_add is used for both kobjs (srv_path->kobj and srv_path->stats->kobj_stats), however we missed to call kobject_del for srv_path->kobj which was called in free_path. 4. rtrs_srv_destroy_once_sysfs_root_folders is independent of either kobj or kobj_stats. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-8-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index 2a3c9ac64a42e..da8e205ce3311 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -304,12 +304,18 @@ destroy_root: void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path) { - if (srv_path->kobj.state_in_sysfs) { + if (srv_path->stats->kobj_stats.state_in_sysfs) { + sysfs_remove_group(&srv_path->stats->kobj_stats, + &rtrs_srv_stats_attr_group); kobject_del(&srv_path->stats->kobj_stats); kobject_put(&srv_path->stats->kobj_stats); + } + + if (srv_path->kobj.state_in_sysfs) { sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); + kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); - - rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } + + rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } -- GitLab From 34a046f08b62fb855ac590c1f4dfb1934f1fdb64 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:45 +0800 Subject: [PATCH 0735/1423] RDMA/rtrs-srv: Remove kobject_del from rtrs_srv_destroy_once_sysfs_root_folders The kobj_paths which is created dynamically by kobject_create_and_add, and per the comment above kobject_create_and_add, we only need to call kobject_put which is not same as other kobjs such as stats->kobj_stats and srv_path->kobj. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-9-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index da8e205ce3311..c76ba29da1e20 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -203,7 +203,6 @@ rtrs_srv_destroy_once_sysfs_root_folders(struct rtrs_srv_path *srv_path) mutex_lock(&srv->paths_mutex); if (!--srv->dev_ref) { - kobject_del(srv->kobj_paths); kobject_put(srv->kobj_paths); mutex_unlock(&srv->paths_mutex); device_del(&srv->dev); -- GitLab From d017eeabd5092565c3dd1c8a7b00ba724c33c18f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:17 +0000 Subject: [PATCH 0736/1423] arm64: Add ID_DFR0_EL1.PerfMon values for PMUv3p7 and IMP_DEF Align the ID_DFR0_EL1.PerfMon values with ID_AA64DFR0_EL1.PMUver. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-2-maz@kernel.org --- arch/arm64/include/asm/sysreg.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 7d301700d1a93..84f59ce1dc6db 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -698,6 +698,8 @@ #define ID_DFR0_PERFMON_8_1 0x4 #define ID_DFR0_PERFMON_8_4 0x5 #define ID_DFR0_PERFMON_8_5 0x6 +#define ID_DFR0_PERFMON_8_7 0x7 +#define ID_DFR0_PERFMON_IMP_DEF 0xf #define ID_ISAR4_SWP_FRAC_SHIFT 28 #define ID_ISAR4_PSR_M_SHIFT 24 -- GitLab From bead02204e9806807bb290137b1ccabfcb4b16fd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:18 +0000 Subject: [PATCH 0737/1423] KVM: arm64: PMU: Align chained counter implementation with architecture pseudocode Ricardo recently pointed out that the PMU chained counter emulation in KVM wasn't quite behaving like the one on actual hardware, in the sense that a chained counter would expose an overflow on both halves of a chained counter, while KVM would only expose the overflow on the top half. The difference is subtle, but significant. What does the architecture say (DDI0087 H.a): - Up to PMUv3p4, all counters but the cycle counter are 32bit - A 32bit counter that overflows generates a CHAIN event on the adjacent counter after exposing its own overflow status - The CHAIN event is accounted if the counter is correctly configured (CHAIN event selected and counter enabled) This all means that our current implementation (which uses 64bit perf events) prevents us from emulating this overflow on the lower half. How to fix this? By implementing the above, to the letter. This largely results in code deletion, removing the notions of "counter pair", "chained counters", and "canonical counter". The code is further restructured to make the CHAIN handling similar to SWINC, as the two are now extremely similar in behaviour. Reported-by: Ricardo Koller Signed-off-by: Marc Zyngier Reviewed-by: Reiji Watanabe Link: https://lore.kernel.org/r/20221113163832.3154370-3-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 320 ++++++++++---------------------------- include/kvm/arm_pmu.h | 2 - 2 files changed, 86 insertions(+), 236 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 0003c7d37533a..57765be69bea0 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -15,16 +15,14 @@ #include #include +#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) + DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx); -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); - -#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 static u32 kvm_pmu_event_mask(struct kvm *kvm) { @@ -57,6 +55,11 @@ static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); } +static bool kvm_pmu_counter_can_chain(struct kvm_vcpu *vcpu, u64 idx) +{ + return (!(idx & 1) && (idx + 1) < ARMV8_PMU_CYCLE_IDX); +} + static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) { struct kvm_pmu *pmu; @@ -69,91 +72,22 @@ static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) } /** - * kvm_pmu_pmc_is_chained - determine if the pmc is chained - * @pmc: The PMU counter pointer - */ -static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc) -{ - struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - - return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); -} - -/** - * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter - * @select_idx: The counter index - */ -static bool kvm_pmu_idx_is_high_counter(u64 select_idx) -{ - return select_idx & 0x1; -} - -/** - * kvm_pmu_get_canonical_pmc - obtain the canonical pmc - * @pmc: The PMU counter pointer - * - * When a pair of PMCs are chained together we use the low counter (canonical) - * to hold the underlying perf event. - */ -static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc) -{ - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - - return pmc; -} -static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc) -{ - if (kvm_pmu_idx_is_high_counter(pmc->idx)) - return pmc - 1; - else - return pmc + 1; -} - -/** - * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain + * kvm_pmu_get_counter_value - get PMU counter value * @vcpu: The vcpu pointer * @select_idx: The counter index */ -static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 eventsel, reg; - - select_idx |= 0x1; - - if (select_idx == ARMV8_PMU_CYCLE_IDX) - return false; - - reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm); - - return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; -} - -/** - * kvm_pmu_get_pair_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @pmc: The PMU counter pointer - */ -static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, - struct kvm_pmc *pmc) +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - u64 counter, counter_high, reg, enabled, running; - - if (kvm_pmu_pmc_is_chained(pmc)) { - pmc = kvm_pmu_get_canonical_pmc(pmc); - reg = PMEVCNTR0_EL0 + pmc->idx; + u64 counter, reg, enabled, running; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - counter = __vcpu_sys_reg(vcpu, reg); - counter_high = __vcpu_sys_reg(vcpu, reg + 1); + if (!kvm_vcpu_has_pmu(vcpu)) + return 0; - counter = lower_32_bits(counter) | (counter_high << 32); - } else { - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - counter = __vcpu_sys_reg(vcpu, reg); - } + reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) + ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; + counter = __vcpu_sys_reg(vcpu, reg); /* * The real counter value is equal to the value of counter register plus @@ -163,29 +97,7 @@ static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu, counter += perf_event_read_value(pmc->perf_event, &enabled, &running); - return counter; -} - -/** - * kvm_pmu_get_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) -{ - u64 counter; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - - if (kvm_pmu_pmc_is_chained(pmc) && - kvm_pmu_idx_is_high_counter(select_idx)) - counter = upper_32_bits(counter); - else if (select_idx != ARMV8_PMU_CYCLE_IDX) + if (select_idx != ARMV8_PMU_CYCLE_IDX) counter = lower_32_bits(counter); return counter; @@ -218,7 +130,6 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) */ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) { - pmc = kvm_pmu_get_canonical_pmc(pmc); if (pmc->perf_event) { perf_event_disable(pmc->perf_event); perf_event_release_kernel(pmc->perf_event); @@ -236,11 +147,10 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) { u64 counter, reg, val; - pmc = kvm_pmu_get_canonical_pmc(pmc); if (!pmc->perf_event) return; - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + counter = kvm_pmu_get_counter_value(vcpu, pmc->idx); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { reg = PMCCNTR_EL0; @@ -252,9 +162,6 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) __vcpu_sys_reg(vcpu, reg) = val; - if (kvm_pmu_pmc_is_chained(pmc)) - __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); - kvm_pmu_release_perf_event(pmc); } @@ -285,8 +192,6 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) for_each_set_bit(i, &mask, 32) kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - - bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); } /** @@ -340,12 +245,9 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) pmc = &pmu->pmc[i]; - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); - - /* At this point, pmc must be the canonical */ - if (pmc->perf_event) { + if (!pmc->perf_event) { + kvm_pmu_create_perf_event(vcpu, i); + } else { perf_event_enable(pmc->perf_event); if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) kvm_debug("fail to enable perf event\n"); @@ -375,11 +277,6 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) pmc = &pmu->pmc[i]; - /* A change in the enable state may affect the chain state */ - kvm_pmu_update_pmc_chained(vcpu, i); - kvm_pmu_create_perf_event(vcpu, i); - - /* At this point, pmc must be the canonical */ if (pmc->perf_event) perf_event_disable(pmc->perf_event); } @@ -484,6 +381,48 @@ static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) kvm_vcpu_kick(vcpu); } +/* + * Perform an increment on any of the counters described in @mask, + * generating the overflow if required, and propagate it as a chained + * event if possible. + */ +static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, + unsigned long mask, u32 event) +{ + int i; + + if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + return; + + /* Weed out disabled counters */ + mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + + for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { + u64 type, reg; + + /* Filter on event type */ + type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + type &= kvm_pmu_event_mask(vcpu->kvm); + if (type != event) + continue; + + /* Increment this counter */ + reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + reg = lower_32_bits(reg); + __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; + + if (reg) /* No overflow? move on */ + continue; + + /* Mark overflow */ + __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + + if (kvm_pmu_counter_can_chain(vcpu, i)) + kvm_pmu_counter_increment(vcpu, BIT(i + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + } +} + /** * When the perf event overflows, set the overflow status and inform the vcpu. */ @@ -514,6 +453,10 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + if (kvm_pmu_counter_can_chain(vcpu, idx)) + kvm_pmu_counter_increment(vcpu, BIT(idx + 1), + ARMV8_PMUV3_PERFCTR_CHAIN); + if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); @@ -533,50 +476,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, */ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; - - if (!kvm_vcpu_has_pmu(vcpu)) - return; - - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) - return; - - /* Weed out disabled counters */ - val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); - - for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { - u64 type, reg; - - if (!(val & BIT(i))) - continue; - - /* PMSWINC only applies to ... SW_INC! */ - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= kvm_pmu_event_mask(vcpu->kvm); - if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) - continue; - - /* increment this even SW_INC counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - - if (reg) /* no overflow on the low part */ - continue; - - if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) { - /* increment the high counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1; - reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg; - if (!reg) /* mark overflow on the high counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1); - } else { - /* mark overflow on low counter */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); - } - } + kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR); } /** @@ -625,18 +525,11 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) { struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; + struct kvm_pmc *pmc = &pmu->pmc[select_idx]; struct perf_event *event; struct perf_event_attr attr; u64 eventsel, counter, reg, data; - /* - * For chained counters the event type and filtering attributes are - * obtained from the low/even counter. We also use this counter to - * determine if the event is enabled/disabled. - */ - pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]); - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; data = __vcpu_sys_reg(vcpu, reg); @@ -647,8 +540,12 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) else eventsel = data & kvm_pmu_event_mask(vcpu->kvm); - /* Software increment event doesn't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) + /* + * Neither SW increment nor chained events need to be backed + * by a perf event. + */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR || + eventsel == ARMV8_PMUV3_PERFCTR_CHAIN) return; /* @@ -670,30 +567,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; - counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); + counter = kvm_pmu_get_counter_value(vcpu, select_idx); - if (kvm_pmu_pmc_is_chained(pmc)) { - /** - * The initial sample period (overflow count) of an event. For - * chained counters we only support overflow interrupts on the - * high counter. - */ + /* + * If counting with a 64bit counter, advertise it to the perf + * code, carefully dealing with the initial sample period. + */ + if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) { + attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; attr.sample_period = (-counter) & GENMASK(63, 0); - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; - - event = perf_event_create_kernel_counter(&attr, -1, current, - kvm_pmu_perf_overflow, - pmc + 1); } else { - /* The initial sample period (overflow count) of an event. */ - if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - attr.sample_period = (-counter) & GENMASK(63, 0); - else - attr.sample_period = (-counter) & GENMASK(31, 0); + attr.sample_period = (-counter) & GENMASK(31, 0); + } - event = perf_event_create_kernel_counter(&attr, -1, current, + event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); - } if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", @@ -704,41 +592,6 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) pmc->perf_event = event; } -/** - * kvm_pmu_update_pmc_chained - update chained bitmap - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter - * - * Update the chained bitmap based on the event type written in the - * typer register and the enable state of the odd register. - */ -static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc; - bool new_state, old_state; - - old_state = kvm_pmu_pmc_is_chained(pmc); - new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) && - kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1); - - if (old_state == new_state) - return; - - canonical_pmc = kvm_pmu_get_canonical_pmc(pmc); - kvm_pmu_stop_counter(vcpu, canonical_pmc); - if (new_state) { - /* - * During promotion from !chained to chained we must ensure - * the adjacent counter is stopped and its event destroyed - */ - kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc)); - set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); - return; - } - clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained); -} - /** * kvm_pmu_set_counter_event_type - set selected counter to monitor some event * @vcpu: The vcpu pointer @@ -766,7 +619,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, __vcpu_sys_reg(vcpu, reg) = data & mask; - kvm_pmu_update_pmc_chained(vcpu, select_idx); kvm_pmu_create_perf_event(vcpu, select_idx); } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index c0b868ce6a8f2..96b192139a23a 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -11,7 +11,6 @@ #include #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) -#define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) #ifdef CONFIG_HW_PERF_EVENTS @@ -29,7 +28,6 @@ struct kvm_pmu { struct irq_work overflow_work; struct kvm_pmu_events events; struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; - DECLARE_BITMAP(chained, ARMV8_PMU_MAX_COUNTER_PAIRS); int irq_num; bool created; bool irq_level; -- GitLab From acdd8a4e13a008a83c6da88bb53eecbecda9714c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:19 +0000 Subject: [PATCH 0738/1423] KVM: arm64: PMU: Always advertise the CHAIN event Even when the underlying HW doesn't offer the CHAIN event (which happens with QEMU), we can always support it as we're in control of the counter overflow. Always advertise the event via PMCEID0_EL0. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-4-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 57765be69bea0..69b67ab3c4bfa 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -701,6 +701,8 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) if (!pmceid1) { val = read_sysreg(pmceid0_el0); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); base = 0; } else { val = read_sysreg(pmceid1_el0); -- GitLab From c82d28cbf1d4f9fe174041b4485c635cb970afa7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:20 +0000 Subject: [PATCH 0739/1423] KVM: arm64: PMU: Distinguish between 64bit counter and 64bit overflow The PMU architecture makes a subtle difference between a 64bit counter and a counter that has a 64bit overflow. This is for example the case of the cycle counter, which can generate an overflow on a 32bit boundary if PMCR_EL0.LC==0 despite the accumulation being done on 64 bits. Use this distinction in the few cases where it matters in the code, as we will reuse this with PMUv3p5 long counters. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-5-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 43 ++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 69b67ab3c4bfa..d050143326b58 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -50,6 +50,11 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm) * @select_idx: The counter index */ static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) +{ + return (select_idx == ARMV8_PMU_CYCLE_IDX); +} + +static bool kvm_pmu_idx_has_64bit_overflow(struct kvm_vcpu *vcpu, u64 select_idx) { return (select_idx == ARMV8_PMU_CYCLE_IDX && __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); @@ -57,7 +62,8 @@ static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) static bool kvm_pmu_counter_can_chain(struct kvm_vcpu *vcpu, u64 idx) { - return (!(idx & 1) && (idx + 1) < ARMV8_PMU_CYCLE_IDX); + return (!(idx & 1) && (idx + 1) < ARMV8_PMU_CYCLE_IDX && + !kvm_pmu_idx_has_64bit_overflow(vcpu, idx)); } static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) @@ -97,7 +103,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); - if (select_idx != ARMV8_PMU_CYCLE_IDX) + if (!kvm_pmu_idx_is_64bit(vcpu, select_idx)) counter = lower_32_bits(counter); return counter; @@ -423,6 +429,23 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, } } +/* Compute the sample period for a given counter value */ +static u64 compute_period(struct kvm_vcpu *vcpu, u64 select_idx, u64 counter) +{ + u64 val; + + if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) { + if (!kvm_pmu_idx_has_64bit_overflow(vcpu, select_idx)) + val = -(counter & GENMASK(31, 0)); + else + val = (-counter) & GENMASK(63, 0); + } else { + val = (-counter) & GENMASK(31, 0); + } + + return val; +} + /** * When the perf event overflows, set the overflow status and inform the vcpu. */ @@ -442,10 +465,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, * Reset the sample period to the architectural limit, * i.e. the point where the counter overflows. */ - period = -(local64_read(&perf_event->count)); - - if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) - period &= GENMASK(31, 0); + period = compute_period(vcpu, idx, local64_read(&perf_event->count)); local64_set(&perf_event->hw.period_left, 0); perf_event->attr.sample_period = period; @@ -571,14 +591,13 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) /* * If counting with a 64bit counter, advertise it to the perf - * code, carefully dealing with the initial sample period. + * code, carefully dealing with the initial sample period + * which also depends on the overflow. */ - if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) { + if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; - attr.sample_period = (-counter) & GENMASK(63, 0); - } else { - attr.sample_period = (-counter) & GENMASK(31, 0); - } + + attr.sample_period = compute_period(vcpu, select_idx, counter); event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); -- GitLab From 001d85bd6c039d3662a4f33a5d212ef3e0438b27 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:21 +0000 Subject: [PATCH 0740/1423] KVM: arm64: PMU: Narrow the overflow checking when required For 64bit counters that overflow on a 32bit boundary, make sure we only check the bottom 32bit to generate a CHAIN event. Signed-off-by: Marc Zyngier Reviewed-by: Reiji Watanabe Link: https://lore.kernel.org/r/20221113163832.3154370-6-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index d050143326b58..9e6bc7edc4de0 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -417,7 +417,8 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, reg = lower_32_bits(reg); __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; - if (reg) /* No overflow? move on */ + /* No overflow? move on */ + if (kvm_pmu_idx_has_64bit_overflow(vcpu, i) ? reg : lower_32_bits(reg)) continue; /* Mark overflow */ -- GitLab From 0f1e172b54f7574ca6aa46b851b332896add955f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:22 +0000 Subject: [PATCH 0741/1423] KVM: arm64: PMU: Only narrow counters that are not 64bit wide The current PMU emulation sometimes narrows counters to 32bit if the counter isn't the cycle counter. As this is going to change with PMUv3p5 where the counters are all 64bit, fix the couple of cases where this happens unconditionally. Signed-off-by: Marc Zyngier Reviewed-by: Reiji Watanabe Link: https://lore.kernel.org/r/20221113163832.3154370-7-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 9e6bc7edc4de0..1fab889dbc74f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -151,20 +151,17 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) */ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) { - u64 counter, reg, val; + u64 reg, val; if (!pmc->perf_event) return; - counter = kvm_pmu_get_counter_value(vcpu, pmc->idx); + val = kvm_pmu_get_counter_value(vcpu, pmc->idx); - if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) reg = PMCCNTR_EL0; - val = counter; - } else { + else reg = PMEVCNTR0_EL0 + pmc->idx; - val = lower_32_bits(counter); - } __vcpu_sys_reg(vcpu, reg) = val; @@ -414,7 +411,8 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, /* Increment this counter */ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; - reg = lower_32_bits(reg); + if (!kvm_pmu_idx_is_64bit(vcpu, i)) + reg = lower_32_bits(reg); __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; /* No overflow? move on */ -- GitLab From 0cb9c3c87a9d3287eaf353936e6846d885102439 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:23 +0000 Subject: [PATCH 0742/1423] KVM: arm64: PMU: Add counter_index_to_*reg() helpers In order to reduce the boilerplate code, add two helpers returning the counter register index (resp. the event register) in the vcpu register file from the counter index. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-8-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 1fab889dbc74f..faab0f57a45de 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -77,6 +77,16 @@ static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) return container_of(vcpu_arch, struct kvm_vcpu, arch); } +static u32 counter_index_to_reg(u64 idx) +{ + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx; +} + +static u32 counter_index_to_evtreg(u64 idx) +{ + return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; +} + /** * kvm_pmu_get_counter_value - get PMU counter value * @vcpu: The vcpu pointer @@ -91,8 +101,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) if (!kvm_vcpu_has_pmu(vcpu)) return 0; - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; + reg = counter_index_to_reg(select_idx); counter = __vcpu_sys_reg(vcpu, reg); /* @@ -122,8 +131,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; + reg = counter_index_to_reg(select_idx); __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); /* Recreate the perf event to reflect the updated sample_period */ @@ -158,10 +166,7 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) val = kvm_pmu_get_counter_value(vcpu, pmc->idx); - if (pmc->idx == ARMV8_PMU_CYCLE_IDX) - reg = PMCCNTR_EL0; - else - reg = PMEVCNTR0_EL0 + pmc->idx; + reg = counter_index_to_reg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = val; @@ -404,16 +409,16 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, u64 type, reg; /* Filter on event type */ - type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i)); type &= kvm_pmu_event_mask(vcpu->kvm); if (type != event) continue; /* Increment this counter */ - reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; + reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmu_idx_is_64bit(vcpu, i)) reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; + __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; /* No overflow? move on */ if (kvm_pmu_idx_has_64bit_overflow(vcpu, i) ? reg : lower_32_bits(reg)) @@ -549,8 +554,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) struct perf_event_attr attr; u64 eventsel, counter, reg, data; - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx; + reg = counter_index_to_evtreg(select_idx); data = __vcpu_sys_reg(vcpu, reg); kvm_pmu_stop_counter(vcpu, pmc); @@ -632,8 +636,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, mask &= ~ARMV8_PMU_EVTYPE_EVENT; mask |= kvm_pmu_event_mask(vcpu->kvm); - reg = (select_idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; + reg = counter_index_to_evtreg(select_idx); __vcpu_sys_reg(vcpu, reg) = data & mask; -- GitLab From 9917264d74d9063341968a8e071266358496777b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:24 +0000 Subject: [PATCH 0743/1423] KVM: arm64: PMU: Simplify setting a counter to a specific value kvm_pmu_set_counter_value() is pretty odd, as it tries to update the counter value while taking into account the value that is currently held by the running perf counter. This is not only complicated, this is quite wrong. Nowhere in the architecture is it said that the counter would be offset by something that is pending. The counter should be updated with the value set by SW, and start counting from there if required. Remove the odd computation and just assign the provided value after having released the perf event (which is then restarted). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-9-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index faab0f57a45de..ea0c8411641fd 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -23,6 +23,7 @@ static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); static u32 kvm_pmu_event_mask(struct kvm *kvm) { @@ -131,8 +132,10 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; + kvm_pmu_release_perf_event(&vcpu->arch.pmu.pmc[select_idx]); + reg = counter_index_to_reg(select_idx); - __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); + __vcpu_sys_reg(vcpu, reg) = val; /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(vcpu, select_idx); -- GitLab From c4b33d28ea51c7d194b19a41c96a4f973cc0a280 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 9 Nov 2022 10:59:05 -0800 Subject: [PATCH 0744/1423] KVM: x86/mmu: Split huge pages mapped by the TDP MMU on fault Now that the TDP MMU has a mechanism to split huge pages, use it in the fault path when a huge page needs to be replaced with a mapping at a lower level. This change reduces the negative performance impact of NX HugePages. Prior to this change if a vCPU executed from a huge page and NX HugePages was enabled, the vCPU would take a fault, zap the huge page, and mapping the faulting address at 4KiB with execute permissions enabled. The rest of the memory would be left *unmapped* and have to be faulted back in by the guest upon access (read, write, or execute). If guest is backed by 1GiB, a single execute instruction can zap an entire GiB of its physical address space. For example, it can take a VM longer to execute from its memory than to populate that memory in the first place: $ ./execute_perf_test -s anonymous_hugetlb_1gb -v96 Populating memory : 2.748378795s Executing from memory : 2.899670885s With this change, such faults split the huge page instead of zapping it, which avoids the non-present faults on the rest of the huge page: $ ./execute_perf_test -s anonymous_hugetlb_1gb -v96 Populating memory : 2.729544474s Executing from memory : 0.111965688s <--- This change also reduces the performance impact of dirty logging when eager_page_split=N. eager_page_split=N (abbreviated "eps=N" below) can be desirable for read-heavy workloads, as it avoids allocating memory to split huge pages that are never written and avoids increasing the TLB miss cost on reads of those pages. | Config: ept=Y, tdp_mmu=Y, 5% writes | | Iteration 1 dirty memory time | | --------------------------------------------- | vCPU Count | eps=N (Before) | eps=N (After) | eps=Y | ------------ | -------------- | ------------- | ------------ | 2 | 0.332305091s | 0.019615027s | 0.006108211s | 4 | 0.353096020s | 0.019452131s | 0.006214670s | 8 | 0.453938562s | 0.019748246s | 0.006610997s | 16 | 0.719095024s | 0.019972171s | 0.007757889s | 32 | 1.698727124s | 0.021361615s | 0.012274432s | 64 | 2.630673582s | 0.031122014s | 0.016994683s | 96 | 3.016535213s | 0.062608739s | 0.044760838s | Eager page splitting remains beneficial for write-heavy workloads, but the gap is now reduced. | Config: ept=Y, tdp_mmu=Y, 100% writes | | Iteration 1 dirty memory time | | --------------------------------------------- | vCPU Count | eps=N (Before) | eps=N (After) | eps=Y | ------------ | -------------- | ------------- | ------------ | 2 | 0.317710329s | 0.296204596s | 0.058689782s | 4 | 0.337102375s | 0.299841017s | 0.060343076s | 8 | 0.386025681s | 0.297274460s | 0.060399702s | 16 | 0.791462524s | 0.298942578s | 0.062508699s | 32 | 1.719646014s | 0.313101996s | 0.075984855s | 64 | 2.527973150s | 0.455779206s | 0.079789363s | 96 | 2.681123208s | 0.673778787s | 0.165386739s | Further study is needed to determine if the remaining gap is acceptable for customer workloads or if eager_page_split=N still requires a-priori knowledge of the VM workload, especially when considering these costs extrapolated out to large VMs with e.g. 416 vCPUs and 12TB RAM. Signed-off-by: David Matlack Reviewed-by: Mingwei Zhang Message-Id: <20221109185905.486172-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 73 ++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4e5b3ae824c16..e085967754276 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1146,6 +1146,9 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, return 0; } +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared); + /* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. @@ -1171,49 +1174,42 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (iter.level == fault->goal_level) break; - /* - * If there is an SPTE mapping a large page at a higher level - * than the target, that SPTE must be cleared and replaced - * with a non-leaf SPTE. - */ + /* Step down into the lower level page table if it exists. */ if (is_shadow_present_pte(iter.old_spte) && - is_large_pte(iter.old_spte)) { - if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) - break; + !is_large_pte(iter.old_spte)) + continue; - /* - * The iter must explicitly re-read the spte here - * because the new value informs the !present - * path below. - */ - iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); - } + /* + * If SPTE has been frozen by another thread, just give up and + * retry, avoiding unnecessary page table allocation and free. + */ + if (is_removed_spte(iter.old_spte)) + break; - if (!is_shadow_present_pte(iter.old_spte)) { - /* - * If SPTE has been frozen by another thread, just - * give up and retry, avoiding unnecessary page table - * allocation and free. - */ - if (is_removed_spte(iter.old_spte)) - break; + /* + * The SPTE is either non-present or points to a huge page that + * needs to be split. + */ + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - sp = tdp_mmu_alloc_sp(vcpu); - tdp_mmu_init_child_sp(sp, &iter); + sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - sp->nx_huge_page_disallowed = fault->huge_page_disallowed; + if (is_shadow_present_pte(iter.old_spte)) + ret = tdp_mmu_split_huge_page(kvm, &iter, sp, true); + else + ret = tdp_mmu_link_sp(kvm, &iter, sp, true); - if (tdp_mmu_link_sp(kvm, &iter, sp, true)) { - tdp_mmu_free_sp(sp); - break; - } + if (ret) { + tdp_mmu_free_sp(sp); + break; + } - if (fault->huge_page_disallowed && - fault->req_level >= iter.level) { - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - track_possible_nx_huge_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - } + if (fault->huge_page_disallowed && + fault->req_level >= iter.level) { + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + track_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } } @@ -1477,6 +1473,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, return sp; } +/* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { @@ -1484,8 +1481,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, const int level = iter->level; int ret, i; - tdp_mmu_init_child_sp(sp, iter); - /* * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. @@ -1561,6 +1556,8 @@ retry: continue; } + tdp_mmu_init_child_sp(sp, &iter); + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) goto retry; -- GitLab From 63d28a25e04cb48e6bd15141506645ac99d9f8b2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Nov 2022 11:05:51 -0500 Subject: [PATCH 0745/1423] KVM: x86/mmu: simplify kvm_tdp_mmu_map flow when guest has to retry A removed SPTE is never present, hence the "if" in kvm_tdp_mmu_map only fails in the exact same conditions that the earlier loop tested in order to issue a "break". So, instead of checking twice the condition (upper level SPTEs could not be created or was frozen), just exit the loop with a goto---the usual poor-man C replacement for RAII early returns. While at it, do not use the "ret" variable for return values of functions that do not return a RET_PF_* enum. This is clearer and also makes it possible to initialize ret to RET_PF_RETRY. Suggested-by: Robert Hoo Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 40 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e085967754276..771210ce51811 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1159,7 +1159,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; - int ret; + int ret = RET_PF_RETRY; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -1168,23 +1168,25 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + int r; + if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); if (iter.level == fault->goal_level) break; - /* Step down into the lower level page table if it exists. */ - if (is_shadow_present_pte(iter.old_spte) && - !is_large_pte(iter.old_spte)) - continue; - /* * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ if (is_removed_spte(iter.old_spte)) - break; + goto retry; + + /* Step down into the lower level page table if it exists. */ + if (is_shadow_present_pte(iter.old_spte) && + !is_large_pte(iter.old_spte)) + continue; /* * The SPTE is either non-present or points to a huge page that @@ -1196,13 +1198,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) sp->nx_huge_page_disallowed = fault->huge_page_disallowed; if (is_shadow_present_pte(iter.old_spte)) - ret = tdp_mmu_split_huge_page(kvm, &iter, sp, true); + r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); else - ret = tdp_mmu_link_sp(kvm, &iter, sp, true); + r = tdp_mmu_link_sp(kvm, &iter, sp, true); - if (ret) { + /* + * Also force the guest to retry the access if the upper level SPTEs + * aren't in place. + */ + if (r) { tdp_mmu_free_sp(sp); - break; + goto retry; } if (fault->huge_page_disallowed && @@ -1213,18 +1219,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - /* - * Force the guest to retry the access if the upper level SPTEs aren't - * in place, or if the target leaf SPTE is frozen by another CPU. - */ - if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { - rcu_read_unlock(); - return RET_PF_RETRY; - } - ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); - rcu_read_unlock(); +retry: + rcu_read_unlock(); return ret; } -- GitLab From eb298605705a5c6b3d61c754e3c80ac8ef8e8724 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 3 Nov 2022 13:44:21 -0700 Subject: [PATCH 0746/1423] KVM: x86/mmu: Do not recover dirty-tracked NX Huge Pages Do not recover (i.e. zap) an NX Huge Page that is being dirty tracked, as it will just be faulted back in at the same 4KiB granularity when accessed by a vCPU. This may need to be changed if KVM ever supports 2MiB (or larger) dirty tracking granularity, or faulting huge pages during dirty tracking for reads/executes. However for now, these zaps are entirely wasteful. In order to check if this commit increases the CPU usage of the NX recovery worker thread I used a modified version of execute_perf_test [1] that supports splitting guest memory into multiple slots and reports /proc/pid/schedstat:se.sum_exec_runtime for the NX recovery worker just before tearing down the VM. The goal was to force a large number of NX Huge Page recoveries and see if the recovery worker used any more CPU. Test Setup: echo 1000 > /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms echo 10 > /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio Test Command: ./execute_perf_test -v64 -s anonymous_hugetlb_1gb -x 16 -o | kvm-nx-lpage-re:se.sum_exec_runtime | | ---------------------------------------- | Run | Before | After | ------- | ------------------ | ------------------- | 1 | 730.084105 | 724.375314 | 2 | 728.751339 | 740.581988 | 3 | 736.264720 | 757.078163 | Comparing the median results, this commit results in about a 1% increase CPU usage of the NX recovery worker when testing a VM with 16 slots. However, the effect is negligible with the default halving time of NX pages, which is 1 hour rather than 10 seconds given by period_ms = 1000, ratio = 10. [1] https://lore.kernel.org/kvm/20221019234050.3919566-2-dmatlack@google.com/ Signed-off-by: David Matlack Message-Id: <20221103204421.1146958-1-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 93c389eaf4711..cfff74685a259 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6841,6 +6841,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6875,7 +6876,21 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); - if (is_tdp_mmu_page(sp)) + WARN_ON_ONCE(!sp->role.direct); + + slot = gfn_to_memslot(kvm, sp->gfn); + WARN_ON_ONCE(!slot); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + */ + if (slot && kvm_slot_dirty_track_enabled(slot)) + unaccount_nx_huge_page(kvm, sp); + else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); -- GitLab From effae0e3d9e1139d583e9b5d050f4f948825b8a3 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 15 Nov 2022 10:51:33 +0000 Subject: [PATCH 0747/1423] riscv: Kconfig: Enable cpufreq kconfig menu Enable cpufreq kconfig menu for RISC-V. Signed-off-by: Lad Prabhakar Reviewed-by: Conor Dooley Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221115105135.1180490-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7cd981f96f48b..3b41165a8b100 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -684,6 +684,8 @@ menu "CPU Power Management" source "drivers/cpuidle/Kconfig" +source "drivers/cpufreq/Kconfig" + endmenu # "CPU Power Management" source "arch/riscv/kvm/Kconfig" -- GitLab From 8610e98f0b48a7f9974cb12c5f501433c4afa958 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 16 Nov 2022 14:50:59 -0600 Subject: [PATCH 0748/1423] PCI: Drop of_match_ptr() to avoid unused variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have stubs for most OF interfaces even when CONFIG_OF is not set, so we allow building of most controller drivers in that case for compile testing. When CONFIG_OF is not set, "of_match_ptr()" compiles to NULL, which leaves unused, resulting in errors like this: $ make W=1 drivers/pci/controller/pci-xgene.c:636:34: error: ‘xgene_pcie_match_table’ defined but not used [-Werror=unused-const-variable=] Drop of_match_ptr() to avoid the unused variable warning. See also 1dff012f636d ("PCI: Drop of_match_ptr() to avoid unused variables"). Link: https://lore.kernel.org/r/20221025191339.667614-2-helgaas@kernel.org Link: https://lore.kernel.org/r/20221116205100.1136224-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-ftpci100.c | 2 +- drivers/pci/controller/pci-v3-semi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-ftpci100.c b/drivers/pci/controller/pci-ftpci100.c index 0cfd9d5a497c9..ecd3009df586d 100644 --- a/drivers/pci/controller/pci-ftpci100.c +++ b/drivers/pci/controller/pci-ftpci100.c @@ -553,7 +553,7 @@ static const struct of_device_id faraday_pci_of_match[] = { static struct platform_driver faraday_pci_driver = { .driver = { .name = "ftpci100", - .of_match_table = of_match_ptr(faraday_pci_of_match), + .of_match_table = faraday_pci_of_match, .suppress_bind_attrs = true, }, .probe = faraday_pci_probe, diff --git a/drivers/pci/controller/pci-v3-semi.c b/drivers/pci/controller/pci-v3-semi.c index 784fcf35599c6..ca44b0c83d1bf 100644 --- a/drivers/pci/controller/pci-v3-semi.c +++ b/drivers/pci/controller/pci-v3-semi.c @@ -901,7 +901,7 @@ static const struct of_device_id v3_pci_of_match[] = { static struct platform_driver v3_pci_driver = { .driver = { .name = "pci-v3-semi", - .of_match_table = of_match_ptr(v3_pci_of_match), + .of_match_table = v3_pci_of_match, .suppress_bind_attrs = true, }, .probe = v3_pci_probe, -- GitLab From 16bdbae394280f1d97933d919023eccbf0b564bd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 7 Nov 2022 13:24:55 +0100 Subject: [PATCH 0749/1423] hwrng: core - treat default_quality as a maximum and default to 1024 Most hw_random devices return entropy which is assumed to be of full quality, but driver authors don't bother setting the quality knob. Some hw_random devices return less than full quality entropy, and then driver authors set the quality knob. Therefore, the entropy crediting should be opt-out rather than opt-in per-driver, to reflect the actual reality on the ground. For example, the two Raspberry Pi RNG drivers produce full entropy randomness, and both EDK2 and U-Boot's drivers for these treat them as such. The result is that EFI then uses these numbers and passes the to Linux, and Linux credits them as boot, thereby initializing the RNG. Yet, in Linux, the quality knob was never set to anything, and so on the chance that Linux is booted without EFI, nothing is ever credited. That's annoying. The same pattern appears to repeat itself throughout various drivers. In fact, very very few drivers have bothered setting quality=1024. Looking at the git history of existing drivers and corresponding mailing list discussion, this conclusion tracks. There's been a decent amount of discussion about drivers that set quality < 1024 -- somebody read and interepreted a datasheet, or made some back of the envelope calculation somehow. But there's been very little, if any, discussion about most drivers where the quality is just set to 1024 or unset (or set to 1000 when the authors misunderstood the API and assumed it was base-10 rather than base-2); in both cases the intent was fairly clear of, "this is a hardware random device; it's fine." So let's invert this logic. A hw_random struct's quality knob now controls the maximum quality a driver can produce, or 0 to specify 1024. Then, the module-wide switch called "default_quality" is changed to represent the maximum quality of any driver. By default it's 1024, and the quality of any particular driver is then given by: min(default_quality, rng->quality ?: 1024); This way, the user can still turn this off for weird reasons (and we can replace whatever driver-specific disabling hacks existed in the past), yet we get proper crediting for relevant RNGs. Cc: Dominik Brodowski Cc: Ard Biesheuvel Cc: Herbert Xu Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/um/drivers/random.c | 1 - drivers/char/hw_random/cavium-rng-vf.c | 1 - drivers/char/hw_random/cn10k-rng.c | 1 - drivers/char/hw_random/core.c | 9 +++------ drivers/char/hw_random/mpfs-rng.c | 1 - drivers/char/hw_random/npcm-rng.c | 1 - drivers/char/hw_random/s390-trng.c | 1 - drivers/char/hw_random/timeriomem-rng.c | 2 -- drivers/char/hw_random/virtio-rng.c | 1 - drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c | 1 - drivers/crypto/atmel-sha204a.c | 1 - drivers/crypto/caam/caamrng.c | 1 - drivers/firmware/turris-mox-rwtm.c | 1 - drivers/s390/crypto/zcrypt_api.c | 6 ------ drivers/usb/misc/chaoskey.c | 1 - include/linux/hw_random.h | 2 +- 16 files changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 32b3341fe9707..da985e0dc69a5 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -82,7 +82,6 @@ static int __init rng_init (void) sigio_broken(random_fd); hwrng.name = RNG_MODULE_NAME; hwrng.read = rng_dev_read; - hwrng.quality = 1024; err = hwrng_register(&hwrng); if (err) { diff --git a/drivers/char/hw_random/cavium-rng-vf.c b/drivers/char/hw_random/cavium-rng-vf.c index 7c55f4cf4a8ba..c99c54cd99c67 100644 --- a/drivers/char/hw_random/cavium-rng-vf.c +++ b/drivers/char/hw_random/cavium-rng-vf.c @@ -225,7 +225,6 @@ static int cavium_rng_probe_vf(struct pci_dev *pdev, return -ENOMEM; rng->ops.read = cavium_rng_read; - rng->ops.quality = 1000; pci_set_drvdata(pdev, rng); diff --git a/drivers/char/hw_random/cn10k-rng.c b/drivers/char/hw_random/cn10k-rng.c index a01e9307737c5..c1193f85982c3 100644 --- a/drivers/char/hw_random/cn10k-rng.c +++ b/drivers/char/hw_random/cn10k-rng.c @@ -145,7 +145,6 @@ static int cn10k_rng_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; rng->ops.read = cn10k_rng_read; - rng->ops.quality = 1000; rng->ops.priv = (unsigned long)rng; reset_rng_health_state(rng); diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index cc002b0c2f0c3..afde685f5e0a4 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -41,14 +41,14 @@ static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; -static unsigned short default_quality; /* = 0; default to "off" */ +static unsigned short default_quality = 1024; /* default to maximum */ module_param(current_quality, ushort, 0644); MODULE_PARM_DESC(current_quality, "current hwrng entropy estimation per 1024 bits of input -- obsolete, use rng_quality instead"); module_param(default_quality, ushort, 0644); MODULE_PARM_DESC(default_quality, - "default entropy content of hwrng per 1024 bits of input"); + "default maximum entropy content of hwrng per 1024 bits of input"); static void drop_current_rng(void); static int hwrng_init(struct hwrng *rng); @@ -170,10 +170,7 @@ static int hwrng_init(struct hwrng *rng) reinit_completion(&rng->cleanup_done); skip_init: - if (!rng->quality) - rng->quality = default_quality; - if (rng->quality > 1024) - rng->quality = 1024; + rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); current_quality = rng->quality; /* obsolete */ return 0; diff --git a/drivers/char/hw_random/mpfs-rng.c b/drivers/char/hw_random/mpfs-rng.c index 5813da617a485..c6972734ae62e 100644 --- a/drivers/char/hw_random/mpfs-rng.c +++ b/drivers/char/hw_random/mpfs-rng.c @@ -78,7 +78,6 @@ static int mpfs_rng_probe(struct platform_device *pdev) rng_priv->rng.read = mpfs_rng_read; rng_priv->rng.name = pdev->name; - rng_priv->rng.quality = 1024; platform_set_drvdata(pdev, rng_priv); diff --git a/drivers/char/hw_random/npcm-rng.c b/drivers/char/hw_random/npcm-rng.c index 5bf7f370f9859..9903d0357e06e 100644 --- a/drivers/char/hw_random/npcm-rng.c +++ b/drivers/char/hw_random/npcm-rng.c @@ -111,7 +111,6 @@ static int npcm_rng_probe(struct platform_device *pdev) priv->rng.name = pdev->name; priv->rng.read = npcm_rng_read; priv->rng.priv = (unsigned long)&pdev->dev; - priv->rng.quality = 1000; priv->clkp = (u32)(uintptr_t)of_device_get_match_data(&pdev->dev); writel(NPCM_RNG_M1ROSEL, priv->base + NPCM_RNGMODE_REG); diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c index 795853dfc46b7..cffa326ddc8d3 100644 --- a/drivers/char/hw_random/s390-trng.c +++ b/drivers/char/hw_random/s390-trng.c @@ -191,7 +191,6 @@ static struct hwrng trng_hwrng_dev = { .name = "s390-trng", .data_read = trng_hwrng_data_read, .read = trng_hwrng_read, - .quality = 1024, }; diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 8ea1fc831eb7b..26f322d19a883 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -145,8 +145,6 @@ static int timeriomem_rng_probe(struct platform_device *pdev) if (!of_property_read_u32(pdev->dev.of_node, "quality", &i)) priv->rng_ops.quality = i; - else - priv->rng_ops.quality = 0; } else { period = pdata->period; priv->rng_ops.quality = pdata->quality; diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index a6f3a8a2aca6d..f7690e0f92ede 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -148,7 +148,6 @@ static int probe_common(struct virtio_device *vdev) .cleanup = virtio_cleanup, .priv = (unsigned long)vi, .name = vi->name, - .quality = 1000, }; vdev->priv = vi; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c index c4b0a8b588429..e2b9b91046941 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c @@ -108,7 +108,6 @@ int sun8i_ce_hwrng_register(struct sun8i_ce_dev *ce) } ce->trng.name = "sun8i Crypto Engine TRNG"; ce->trng.read = sun8i_ce_trng_read; - ce->trng.quality = 1000; ret = hwrng_register(&ce->trng); if (ret) diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index a84b657598c6e..c0103e7fc2e75 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -107,7 +107,6 @@ static int atmel_sha204a_probe(struct i2c_client *client, i2c_priv->hwrng.name = dev_name(&client->dev); i2c_priv->hwrng.read = atmel_sha204a_rng_read; - i2c_priv->hwrng.quality = 1024; ret = devm_hwrng_register(&client->dev, &i2c_priv->hwrng); if (ret) diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c index 77d048dfe5d06..1f0e820509767 100644 --- a/drivers/crypto/caam/caamrng.c +++ b/drivers/crypto/caam/caamrng.c @@ -246,7 +246,6 @@ int caam_rng_init(struct device *ctrldev) ctx->rng.cleanup = caam_cleanup; ctx->rng.read = caam_read; ctx->rng.priv = (unsigned long)ctx; - ctx->rng.quality = 1024; dev_info(ctrldev, "registering rng-caam\n"); diff --git a/drivers/firmware/turris-mox-rwtm.c b/drivers/firmware/turris-mox-rwtm.c index c2d34dc8ba462..6ea5789a89e2b 100644 --- a/drivers/firmware/turris-mox-rwtm.c +++ b/drivers/firmware/turris-mox-rwtm.c @@ -528,7 +528,6 @@ static int turris_mox_rwtm_probe(struct platform_device *pdev) rwtm->hwrng.name = DRIVER_NAME "_hwrng"; rwtm->hwrng.read = mox_hwrng_read; rwtm->hwrng.priv = (unsigned long) rwtm; - rwtm->hwrng.quality = 1024; ret = devm_hwrng_register(dev, &rwtm->hwrng); if (ret < 0) { diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index f94b43ce9a658..4bf36e53fe3e4 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -53,10 +53,6 @@ MODULE_LICENSE("GPL"); EXPORT_TRACEPOINT_SYMBOL(s390_zcrypt_req); EXPORT_TRACEPOINT_SYMBOL(s390_zcrypt_rep); -static int zcrypt_hwrng_seed = 1; -module_param_named(hwrng_seed, zcrypt_hwrng_seed, int, 0440); -MODULE_PARM_DESC(hwrng_seed, "Turn on/off hwrng auto seed, default is 1 (on)."); - DEFINE_SPINLOCK(zcrypt_list_lock); LIST_HEAD(zcrypt_card_list); @@ -2063,8 +2059,6 @@ int zcrypt_rng_device_add(void) goto out; } zcrypt_rng_buffer_index = 0; - if (!zcrypt_hwrng_seed) - zcrypt_rng_dev.quality = 0; rc = hwrng_register(&zcrypt_rng_dev); if (rc) goto out_free; diff --git a/drivers/usb/misc/chaoskey.c b/drivers/usb/misc/chaoskey.c index 87067c3d6109b..6fb5140e29b9d 100644 --- a/drivers/usb/misc/chaoskey.c +++ b/drivers/usb/misc/chaoskey.c @@ -200,7 +200,6 @@ static int chaoskey_probe(struct usb_interface *interface, dev->hwrng.name = dev->name ? dev->name : chaoskey_driver.name; dev->hwrng.read = chaoskey_rng_read; - dev->hwrng.quality = 1024; dev->hwrng_registered = (hwrng_register(&dev->hwrng) == 0); if (!dev->hwrng_registered) diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 77c2885c4c130..8a3115516a1ba 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -34,7 +34,7 @@ * @priv: Private data, for use by the RNG driver. * @quality: Estimation of true entropy in RNG's bitstream * (in bits of entropy per 1024 bits of input; - * valid values: 1 to 1024, or 0 for unknown). + * valid values: 1 to 1024, or 0 for maximum). */ struct hwrng { const char *name; -- GitLab From 7cdc5e6bcd02ab45d0a2991b35861b448e355276 Mon Sep 17 00:00:00 2001 From: Tomas Marek Date: Tue, 8 Nov 2022 07:42:40 +0100 Subject: [PATCH 0750/1423] hwrng: stm32 - rename readl return value Use a more meaningful name for the readl return value variable. Link: https://lore.kernel.org/all/Y1J3QwynPFIlfrIv@loth.rohan.me.apana.org.au/ Signed-off-by: Tomas Marek Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 366edda4848b6..a6731cf0627ac 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -49,13 +49,13 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) /* Manage timeout which is based on timer and take */ /* care of initial delay time when enabling rng */ if (!sr && wait) { - int ret; + int err; - ret = readl_relaxed_poll_timeout_atomic(priv->base + err = readl_relaxed_poll_timeout_atomic(priv->base + RNG_SR, sr, sr, 10, 50000); - if (ret) + if (err) dev_err((struct device *)priv->rng.priv, "%s: timeout %x!\n", __func__, sr); } -- GitLab From 4f1c596df706c9aca662b6c214fad84047ae2a97 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Tue, 8 Nov 2022 16:29:12 +0800 Subject: [PATCH 0751/1423] crypto: ccree - Remove debugfs when platform_driver_register failed When platform_driver_register failed, we need to remove debugfs, which will caused a resource leak, fix it. Failed logs as follows: [ 32.606488] debugfs: Directory 'ccree' with parent '/' already present! Fixes: 4c3f97276e15 ("crypto: ccree - introduce CryptoCell driver") Signed-off-by: Gaosheng Cui Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_driver.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c index cadead18b59e8..d489c6f808925 100644 --- a/drivers/crypto/ccree/cc_driver.c +++ b/drivers/crypto/ccree/cc_driver.c @@ -651,9 +651,17 @@ static struct platform_driver ccree_driver = { static int __init ccree_init(void) { + int rc; + cc_debugfs_global_init(); - return platform_driver_register(&ccree_driver); + rc = platform_driver_register(&ccree_driver); + if (rc) { + cc_debugfs_global_fini(); + return rc; + } + + return 0; } module_init(ccree_init); -- GitLab From 824db5cd1ec9b95c254fc317c3999f6b53e98b12 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 10 Nov 2022 18:42:04 +0800 Subject: [PATCH 0752/1423] crypto: arm64 - Fix unused variable compilation warnings of cpu_feature The cpu feature defined by MODULE_DEVICE_TABLE is only referenced when compiling as a module, and the warning of unused variable will be encountered when compiling with intree. The warning can be removed by adding the __maybe_unused flag. Fixes: 03c9a333fef1 ("crypto: arm64/ghash - add NEON accelerated fallback for 64-bit PMULL") Fixes: ae1b83c7d572 ("crypto: arm64/sm4 - add CE implementation for GCM mode") Reported-by: kernel test robot Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-glue.c | 2 +- arch/arm64/crypto/sm4-ce-gcm-glue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 15794fe21a0b2..e5e9adc1fcf4e 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -508,7 +508,7 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -static const struct cpu_feature ghash_cpu_feature[] = { +static const struct cpu_feature __maybe_unused ghash_cpu_feature[] = { { cpu_feature(PMULL) }, { } }; MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); diff --git a/arch/arm64/crypto/sm4-ce-gcm-glue.c b/arch/arm64/crypto/sm4-ce-gcm-glue.c index e90ea0f17beb2..c450a2025ca9a 100644 --- a/arch/arm64/crypto/sm4-ce-gcm-glue.c +++ b/arch/arm64/crypto/sm4-ce-gcm-glue.c @@ -271,7 +271,7 @@ static void __exit sm4_ce_gcm_exit(void) crypto_unregister_aead(&sm4_gcm_alg); } -static const struct cpu_feature sm4_ce_gcm_cpu_feature[] = { +static const struct cpu_feature __maybe_unused sm4_ce_gcm_cpu_feature[] = { { cpu_feature(PMULL) }, {} }; -- GitLab From 3a58c231172537f7b0e19d93ed33decd04f80eab Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 11 Nov 2022 17:59:17 +0800 Subject: [PATCH 0753/1423] crypto: cryptd - Use request context instead of stack for sub-request cryptd is buggy as it tries to use sync_skcipher without going through the proper sync_skcipher interface. In fact it doesn't even need sync_skcipher since it's already a proper skcipher and can easily access the request context instead of using something off the stack. Fixes: 36b3875a97b8 ("crypto: cryptd - Remove VLA usage of skcipher") Signed-off-by: Herbert Xu --- crypto/cryptd.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 668095eca0faf..ca3a40fc7da91 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -68,11 +68,12 @@ struct aead_instance_ctx { struct cryptd_skcipher_ctx { refcount_t refcnt; - struct crypto_sync_skcipher *child; + struct crypto_skcipher *child; }; struct cryptd_skcipher_request_ctx { crypto_completion_t complete; + struct skcipher_request req; }; struct cryptd_hash_ctx { @@ -227,13 +228,13 @@ static int cryptd_skcipher_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(parent); - struct crypto_sync_skcipher *child = ctx->child; + struct crypto_skcipher *child = ctx->child; - crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_sync_skcipher_set_flags(child, - crypto_skcipher_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - return crypto_sync_skcipher_setkey(child, key, keylen); + crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(child, + crypto_skcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + return crypto_skcipher_setkey(child, key, keylen); } static void cryptd_skcipher_complete(struct skcipher_request *req, int err) @@ -258,13 +259,13 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base, struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct crypto_sync_skcipher *child = ctx->child; - SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child); + struct skcipher_request *subreq = &rctx->req; + struct crypto_skcipher *child = ctx->child; if (unlikely(err == -EINPROGRESS)) goto out; - skcipher_request_set_sync_tfm(subreq, child); + skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, @@ -286,13 +287,13 @@ static void cryptd_skcipher_decrypt(struct crypto_async_request *base, struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - struct crypto_sync_skcipher *child = ctx->child; - SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child); + struct skcipher_request *subreq = &rctx->req; + struct crypto_skcipher *child = ctx->child; if (unlikely(err == -EINPROGRESS)) goto out; - skcipher_request_set_sync_tfm(subreq, child); + skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, @@ -343,9 +344,10 @@ static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm) if (IS_ERR(cipher)) return PTR_ERR(cipher); - ctx->child = (struct crypto_sync_skcipher *)cipher; + ctx->child = cipher; crypto_skcipher_set_reqsize( - tfm, sizeof(struct cryptd_skcipher_request_ctx)); + tfm, sizeof(struct cryptd_skcipher_request_ctx) + + crypto_skcipher_reqsize(cipher)); return 0; } @@ -353,7 +355,7 @@ static void cryptd_skcipher_exit_tfm(struct crypto_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); - crypto_free_sync_skcipher(ctx->child); + crypto_free_skcipher(ctx->child); } static void cryptd_skcipher_free(struct skcipher_instance *inst) @@ -931,7 +933,7 @@ struct crypto_skcipher *cryptd_skcipher_child(struct cryptd_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base); - return &ctx->child->base; + return ctx->child; } EXPORT_SYMBOL_GPL(cryptd_skcipher_child); -- GitLab From cc7710d0d4ebc6998f04035cde4f32c5ddbe9d7f Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 11 Nov 2022 18:00:36 +0800 Subject: [PATCH 0754/1423] crypto: hisilicon/qm - add missing pci_dev_put() in q_num_set() pci_get_device() will increase the reference count for the returned pci_dev. We need to use pci_dev_put() to decrease the reference count before q_num_set() returns. Fixes: c8b4b477079d ("crypto: hisilicon - add HiSilicon HPRE accelerator") Signed-off-by: Xiongfeng Wang Reviewed-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index e230c7c46110a..c3618255b1504 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -384,14 +384,14 @@ struct hisi_qp { static inline int q_num_set(const char *val, const struct kernel_param *kp, unsigned int device) { - struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, - device, NULL); + struct pci_dev *pdev; u32 n, q_num; int ret; if (!val) return -EINVAL; + pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL); if (!pdev) { q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2); pr_info("No device found currently, suppose queue number is %u\n", @@ -401,6 +401,8 @@ static inline int q_num_set(const char *val, const struct kernel_param *kp, q_num = QM_QNUM_V1; else q_num = QM_QNUM_V2; + + pci_dev_put(pdev); } ret = kstrtou32(val, 10, &n); -- GitLab From e6cb02bd0a52457e486a752da5db7b67f2540c16 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 11 Nov 2022 18:05:41 +0800 Subject: [PATCH 0755/1423] crypto: skcipher - Allow sync algorithms with large request contexts Some sync algorithms may require a large amount of temporary space during its operations. There is no reason why they should be limited just because some legacy users want to place all temporary data on the stack. Such algorithms can now set a flag to indicate that they need extra request context, which will cause them to be invisible to users that go through the sync_skcipher interface. Signed-off-by: Herbert Xu --- crypto/skcipher.c | 2 +- include/crypto/internal/skcipher.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 418211180ceec..0ecab31cfe79b 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -763,7 +763,7 @@ struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ - mask |= CRYPTO_ALG_ASYNC; + mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index a2339f80a6159..2a97540156bbf 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -14,6 +14,14 @@ #include #include +/* + * Set this if your algorithm is sync but needs a reqsize larger + * than MAX_SYNC_SKCIPHER_REQSIZE. + * + * Reuse bit that is specific to hash algorithms. + */ +#define CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE CRYPTO_ALG_OPTIONAL_KEY + struct aead_request; struct rtattr; -- GitLab From 7bbbc9d81be588ae4fb28b5b202e4421dbfef197 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 02:12:50 +0000 Subject: [PATCH 0756/1423] crypto: hisilicon/qm - delete redundant null assignment operations There is no security data in the pointer. It is only a value transferred as a structure. It makes no sense to zero a variable that is on the stack. So not need to set the pointer to null. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 363a02810a164..849dc80a71185 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -1773,7 +1773,6 @@ static void dfx_regs_uninit(struct hisi_qm *qm, dregs[i].regs = NULL; } kfree(dregs); - dregs = NULL; } /** -- GitLab From b40b62ed7b0ffe8eb2e6fe8bcfb47027c9a93e93 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 02:12:51 +0000 Subject: [PATCH 0757/1423] crypto: hisilicon/qm - modify the process of regs dfx The last register logic and different register logic are combined. Use "u32" instead of 'int' in the regs function input parameter to simplify some checks. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_main.c | 7 +- drivers/crypto/hisilicon/qm.c | 151 ++++++++++++---------- drivers/crypto/hisilicon/sec2/sec_main.c | 7 +- drivers/crypto/hisilicon/zip/zip_main.c | 7 +- include/linux/hisi_acc_qm.h | 8 +- 5 files changed, 99 insertions(+), 81 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index baf1faec7046f..923f9c2792654 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -1101,8 +1101,7 @@ static int hpre_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_offset = HPRE_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = HPRE_SQE_MASK_LEN; - ret = hisi_qm_diff_regs_init(qm, hpre_diff_regs, - ARRAY_SIZE(hpre_diff_regs)); + ret = hisi_qm_regs_debugfs_init(qm, hpre_diff_regs, ARRAY_SIZE(hpre_diff_regs)); if (ret) { dev_warn(dev, "Failed to init HPRE diff regs!\n"); goto debugfs_remove; @@ -1121,7 +1120,7 @@ static int hpre_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); debugfs_remove: debugfs_remove_recursive(qm->debug.debug_root); return ret; @@ -1129,7 +1128,7 @@ debugfs_remove: static void hpre_debugfs_exit(struct hisi_qm *qm) { - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(hpre_diff_regs)); debugfs_remove_recursive(qm->debug.debug_root); } diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 849dc80a71185..441466df7c6d5 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -1722,8 +1722,21 @@ static int qm_regs_show(struct seq_file *s, void *unused) DEFINE_SHOW_ATTRIBUTE(qm_regs); +static void dfx_regs_uninit(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len) +{ + int i; + + /* Setting the pointer is NULL to prevent double free */ + for (i = 0; i < reg_len; i++) { + kfree(dregs[i].regs); + dregs[i].regs = NULL; + } + kfree(dregs); +} + static struct dfx_diff_registers *dfx_regs_init(struct hisi_qm *qm, - const struct dfx_diff_registers *cregs, int reg_len) + const struct dfx_diff_registers *cregs, u32 reg_len) { struct dfx_diff_registers *diff_regs; u32 j, base_offset; @@ -1762,64 +1775,107 @@ alloc_error: return ERR_PTR(-ENOMEM); } -static void dfx_regs_uninit(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, int reg_len) +static int qm_diff_regs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len) +{ + qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); + if (IS_ERR(qm->debug.qm_diff_regs)) + return PTR_ERR(qm->debug.qm_diff_regs); + + qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len); + if (IS_ERR(qm->debug.acc_diff_regs)) { + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); + return PTR_ERR(qm->debug.acc_diff_regs); + } + + return 0; +} + +static void qm_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) + return; + + kfree(debug->qm_last_words); + debug->qm_last_words = NULL; +} + +static int qm_last_regs_init(struct hisi_qm *qm) { + int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs); + struct qm_debug *debug = &qm->debug; int i; - /* Setting the pointer is NULL to prevent double free */ - for (i = 0; i < reg_len; i++) { - kfree(dregs[i].regs); - dregs[i].regs = NULL; + if (qm->fun_type == QM_HW_VF) + return 0; + + debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int), GFP_KERNEL); + if (!debug->qm_last_words) + return -ENOMEM; + + for (i = 0; i < dfx_regs_num; i++) { + debug->qm_last_words[i] = readl_relaxed(qm->io_base + + qm_dfx_regs[i].offset); } - kfree(dregs); + + return 0; +} + +static void qm_diff_regs_uninit(struct hisi_qm *qm, u32 reg_len) +{ + dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len); + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); } /** - * hisi_qm_diff_regs_init() - Allocate memory for registers. + * hisi_qm_regs_debugfs_init() - Allocate memory for registers. * @qm: device qm handle. * @dregs: diff registers handle. * @reg_len: diff registers region length. */ -int hisi_qm_diff_regs_init(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, int reg_len) +int hisi_qm_regs_debugfs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len) { - if (!qm || !dregs || reg_len <= 0) + int ret; + + if (!qm || !dregs) return -EINVAL; if (qm->fun_type != QM_HW_PF) return 0; - qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs, - ARRAY_SIZE(qm_diff_regs)); - if (IS_ERR(qm->debug.qm_diff_regs)) - return PTR_ERR(qm->debug.qm_diff_regs); + ret = qm_last_regs_init(qm); + if (ret) { + dev_info(&qm->pdev->dev, "failed to init qm words memory!\n"); + return ret; + } - qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len); - if (IS_ERR(qm->debug.acc_diff_regs)) { - dfx_regs_uninit(qm, qm->debug.qm_diff_regs, - ARRAY_SIZE(qm_diff_regs)); - return PTR_ERR(qm->debug.acc_diff_regs); + ret = qm_diff_regs_init(qm, dregs, reg_len); + if (ret) { + qm_last_regs_uninit(qm); + return ret; } return 0; } -EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_init); +EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_init); /** - * hisi_qm_diff_regs_uninit() - Free memory for registers. + * hisi_qm_regs_debugfs_uninit() - Free memory for registers. * @qm: device qm handle. * @reg_len: diff registers region length. */ -void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len) +void hisi_qm_regs_debugfs_uninit(struct hisi_qm *qm, u32 reg_len) { - if (!qm || reg_len <= 0 || qm->fun_type != QM_HW_PF) + if (!qm || qm->fun_type != QM_HW_PF) return; - dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len); - dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); + qm_diff_regs_uninit(qm, reg_len); + qm_last_regs_uninit(qm); } -EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_uninit); +EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_uninit); /** * hisi_qm_acc_diff_regs_dump() - Dump registers's value. @@ -1829,12 +1885,12 @@ EXPORT_SYMBOL_GPL(hisi_qm_diff_regs_uninit); * @regs_len: diff registers region length. */ void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, - struct dfx_diff_registers *dregs, int regs_len) + struct dfx_diff_registers *dregs, u32 regs_len) { u32 j, val, base_offset; int i, ret; - if (!qm || !s || !dregs || regs_len <= 0) + if (!qm || !s || !dregs) return; ret = hisi_qm_get_dfx_access(qm); @@ -3719,17 +3775,6 @@ static void hisi_qm_set_state(struct hisi_qm *qm, u8 state) writel(state, qm->io_base + QM_VF_STATE); } -static void qm_last_regs_uninit(struct hisi_qm *qm) -{ - struct qm_debug *debug = &qm->debug; - - if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) - return; - - kfree(debug->qm_last_words); - debug->qm_last_words = NULL; -} - static void hisi_qm_unint_work(struct hisi_qm *qm) { destroy_workqueue(qm->wq); @@ -3760,8 +3805,6 @@ static void hisi_qm_memory_uninit(struct hisi_qm *qm) */ void hisi_qm_uninit(struct hisi_qm *qm) { - qm_last_regs_uninit(qm); - qm_cmd_uninit(qm); hisi_qm_unint_work(qm); down_write(&qm->qps_lock); @@ -6339,26 +6382,6 @@ err_destroy_idr: return ret; } -static void qm_last_regs_init(struct hisi_qm *qm) -{ - int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs); - struct qm_debug *debug = &qm->debug; - int i; - - if (qm->fun_type == QM_HW_VF) - return; - - debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int), - GFP_KERNEL); - if (!debug->qm_last_words) - return; - - for (i = 0; i < dfx_regs_num; i++) { - debug->qm_last_words[i] = readl_relaxed(qm->io_base + - qm_dfx_regs[i].offset); - } -} - /** * hisi_qm_init() - Initialize configures about qm. * @qm: The qm needing init. @@ -6407,8 +6430,6 @@ int hisi_qm_init(struct hisi_qm *qm) qm_cmd_init(qm); atomic_set(&qm->status.flags, QM_INIT); - qm_last_regs_init(qm); - return 0; err_free_qm_memory: diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 6eb8a16ba0a70..4e24735d95bae 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -899,8 +899,7 @@ static int sec_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_offset = SEC_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = SEC_SQE_MASK_LEN; - ret = hisi_qm_diff_regs_init(qm, sec_diff_regs, - ARRAY_SIZE(sec_diff_regs)); + ret = hisi_qm_regs_debugfs_init(qm, sec_diff_regs, ARRAY_SIZE(sec_diff_regs)); if (ret) { dev_warn(dev, "Failed to init SEC diff regs!\n"); goto debugfs_remove; @@ -915,7 +914,7 @@ static int sec_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); debugfs_remove: debugfs_remove_recursive(sec_debugfs_root); return ret; @@ -923,7 +922,7 @@ debugfs_remove: static void sec_debugfs_exit(struct hisi_qm *qm) { - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(sec_diff_regs)); debugfs_remove_recursive(qm->debug.debug_root); } diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index c863435e8c75a..1549bec3aea59 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -849,8 +849,7 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm) qm->debug.sqe_mask_offset = HZIP_SQE_MASK_OFFSET; qm->debug.sqe_mask_len = HZIP_SQE_MASK_LEN; qm->debug.debug_root = dev_d; - ret = hisi_qm_diff_regs_init(qm, hzip_diff_regs, - ARRAY_SIZE(hzip_diff_regs)); + ret = hisi_qm_regs_debugfs_init(qm, hzip_diff_regs, ARRAY_SIZE(hzip_diff_regs)); if (ret) { dev_warn(dev, "Failed to init ZIP diff regs!\n"); goto debugfs_remove; @@ -869,7 +868,7 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm) return 0; failed_to_create: - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); debugfs_remove: debugfs_remove_recursive(hzip_debugfs_root); return ret; @@ -895,7 +894,7 @@ static void hisi_zip_debug_regs_clear(struct hisi_qm *qm) static void hisi_zip_debugfs_exit(struct hisi_qm *qm) { - hisi_qm_diff_regs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); + hisi_qm_regs_debugfs_uninit(qm, ARRAY_SIZE(hzip_diff_regs)); debugfs_remove_recursive(qm->debug.debug_root); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c3618255b1504..be3aedaa96dc1 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -471,11 +471,11 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs); void hisi_qm_dev_err_init(struct hisi_qm *qm); void hisi_qm_dev_err_uninit(struct hisi_qm *qm); -int hisi_qm_diff_regs_init(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, int reg_len); -void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len); +int hisi_qm_regs_debugfs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len); +void hisi_qm_regs_debugfs_uninit(struct hisi_qm *qm, u32 reg_len); void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, - struct dfx_diff_registers *dregs, int regs_len); + struct dfx_diff_registers *dregs, u32 regs_len); pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -- GitLab From 94476b2b6d60bc926a585ae62e1bf69bd22c1dff Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 02:12:52 +0000 Subject: [PATCH 0758/1423] crypto: hisilicon/qm - split a debugfs.c from qm Considering that the qm feature and debugfs feature are independent. The code related to debugfs is getting larger and larger. It should be separate as a debugfs file. So move some debugfs code to new file from qm file. The qm code logic is not modified. And maintainability is enhanced. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/Makefile | 2 +- drivers/crypto/hisilicon/debugfs.c | 1097 +++++++++++++++++++++ drivers/crypto/hisilicon/qm.c | 1348 ++------------------------ drivers/crypto/hisilicon/qm_common.h | 87 ++ 4 files changed, 1277 insertions(+), 1257 deletions(-) create mode 100644 drivers/crypto/hisilicon/debugfs.c create mode 100644 drivers/crypto/hisilicon/qm_common.h diff --git a/drivers/crypto/hisilicon/Makefile b/drivers/crypto/hisilicon/Makefile index 1e89269a2e4b0..8595a5a5d2288 100644 --- a/drivers/crypto/hisilicon/Makefile +++ b/drivers/crypto/hisilicon/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_CRYPTO_DEV_HISI_HPRE) += hpre/ obj-$(CONFIG_CRYPTO_DEV_HISI_SEC) += sec/ obj-$(CONFIG_CRYPTO_DEV_HISI_SEC2) += sec2/ obj-$(CONFIG_CRYPTO_DEV_HISI_QM) += hisi_qm.o -hisi_qm-objs = qm.o sgl.o +hisi_qm-objs = qm.o sgl.o debugfs.o obj-$(CONFIG_CRYPTO_DEV_HISI_ZIP) += zip/ obj-$(CONFIG_CRYPTO_DEV_HISI_TRNG) += trng/ diff --git a/drivers/crypto/hisilicon/debugfs.c b/drivers/crypto/hisilicon/debugfs.c new file mode 100644 index 0000000000000..13bec8b2d7237 --- /dev/null +++ b/drivers/crypto/hisilicon/debugfs.c @@ -0,0 +1,1097 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 HiSilicon Limited. */ +#include +#include "qm_common.h" + +#define QM_DFX_BASE 0x0100000 +#define QM_DFX_STATE1 0x0104000 +#define QM_DFX_STATE2 0x01040C8 +#define QM_DFX_COMMON 0x0000 +#define QM_DFX_BASE_LEN 0x5A +#define QM_DFX_STATE1_LEN 0x2E +#define QM_DFX_STATE2_LEN 0x11 +#define QM_DFX_COMMON_LEN 0xC3 +#define QM_DFX_REGS_LEN 4UL +#define QM_DBG_TMP_BUF_LEN 22 +#define CURRENT_FUN_MASK GENMASK(5, 0) +#define CURRENT_Q_MASK GENMASK(31, 16) +#define QM_SQE_ADDR_MASK GENMASK(7, 0) + +#define QM_DFX_MB_CNT_VF 0x104010 +#define QM_DFX_DB_CNT_VF 0x104020 +#define QM_DFX_SQE_CNT_VF_SQN 0x104030 +#define QM_DFX_CQE_CNT_VF_CQN 0x104040 +#define QM_DFX_QN_SHIFT 16 +#define QM_DFX_CNT_CLR_CE 0x100118 +#define QM_DBG_WRITE_LEN 1024 + +static const char * const qm_debug_file_name[] = { + [CURRENT_QM] = "current_qm", + [CURRENT_Q] = "current_q", + [CLEAR_ENABLE] = "clear_enable", +}; + +struct qm_dfx_item { + const char *name; + u32 offset; +}; + +static struct qm_dfx_item qm_dfx_files[] = { + {"err_irq", offsetof(struct qm_dfx, err_irq_cnt)}, + {"aeq_irq", offsetof(struct qm_dfx, aeq_irq_cnt)}, + {"abnormal_irq", offsetof(struct qm_dfx, abnormal_irq_cnt)}, + {"create_qp_err", offsetof(struct qm_dfx, create_qp_err_cnt)}, + {"mb_err", offsetof(struct qm_dfx, mb_err_cnt)}, +}; + +#define CNT_CYC_REGS_NUM 10 +static const struct debugfs_reg32 qm_dfx_regs[] = { + /* XXX_CNT are reading clear register */ + {"QM_ECC_1BIT_CNT ", 0x104000ull}, + {"QM_ECC_MBIT_CNT ", 0x104008ull}, + {"QM_DFX_MB_CNT ", 0x104018ull}, + {"QM_DFX_DB_CNT ", 0x104028ull}, + {"QM_DFX_SQE_CNT ", 0x104038ull}, + {"QM_DFX_CQE_CNT ", 0x104048ull}, + {"QM_DFX_SEND_SQE_TO_ACC_CNT ", 0x104050ull}, + {"QM_DFX_WB_SQE_FROM_ACC_CNT ", 0x104058ull}, + {"QM_DFX_ACC_FINISH_CNT ", 0x104060ull}, + {"QM_DFX_CQE_ERR_CNT ", 0x1040b4ull}, + {"QM_DFX_FUNS_ACTIVE_ST ", 0x200ull}, + {"QM_ECC_1BIT_INF ", 0x104004ull}, + {"QM_ECC_MBIT_INF ", 0x10400cull}, + {"QM_DFX_ACC_RDY_VLD0 ", 0x1040a0ull}, + {"QM_DFX_ACC_RDY_VLD1 ", 0x1040a4ull}, + {"QM_DFX_AXI_RDY_VLD ", 0x1040a8ull}, + {"QM_DFX_FF_ST0 ", 0x1040c8ull}, + {"QM_DFX_FF_ST1 ", 0x1040ccull}, + {"QM_DFX_FF_ST2 ", 0x1040d0ull}, + {"QM_DFX_FF_ST3 ", 0x1040d4ull}, + {"QM_DFX_FF_ST4 ", 0x1040d8ull}, + {"QM_DFX_FF_ST5 ", 0x1040dcull}, + {"QM_DFX_FF_ST6 ", 0x1040e0ull}, + {"QM_IN_IDLE_ST ", 0x1040e4ull}, +}; + +static const struct debugfs_reg32 qm_vf_dfx_regs[] = { + {"QM_DFX_FUNS_ACTIVE_ST ", 0x200ull}, +}; + +/* define the QM's dfx regs region and region length */ +static struct dfx_diff_registers qm_diff_regs[] = { + { + .reg_offset = QM_DFX_BASE, + .reg_len = QM_DFX_BASE_LEN, + }, { + .reg_offset = QM_DFX_STATE1, + .reg_len = QM_DFX_STATE1_LEN, + }, { + .reg_offset = QM_DFX_STATE2, + .reg_len = QM_DFX_STATE2_LEN, + }, { + .reg_offset = QM_DFX_COMMON, + .reg_len = QM_DFX_COMMON_LEN, + }, +}; + +static struct hisi_qm *file_to_qm(struct debugfs_file *file) +{ + struct qm_debug *debug = file->debug; + + return container_of(debug, struct hisi_qm, debug); +} + +static ssize_t qm_cmd_read(struct file *filp, char __user *buffer, + size_t count, loff_t *pos) +{ + char buf[QM_DBG_READ_LEN]; + int len; + + len = scnprintf(buf, QM_DBG_READ_LEN, "%s\n", + "Please echo help to cmd to get help information"); + + return simple_read_from_buffer(buffer, count, pos, buf, len); +} + +static void dump_show(struct hisi_qm *qm, void *info, + unsigned int info_size, char *info_name) +{ + struct device *dev = &qm->pdev->dev; + u8 *info_curr = info; + u32 i; +#define BYTE_PER_DW 4 + + dev_info(dev, "%s DUMP\n", info_name); + for (i = 0; i < info_size; i += BYTE_PER_DW, info_curr += BYTE_PER_DW) { + pr_info("DW%u: %02X%02X %02X%02X\n", i / BYTE_PER_DW, + *(info_curr + 3), *(info_curr + 2), *(info_curr + 1), *(info_curr)); + } +} + +static int qm_sqc_dump(struct hisi_qm *qm, const char *s) +{ + struct device *dev = &qm->pdev->dev; + struct qm_sqc *sqc, *sqc_curr; + dma_addr_t sqc_dma; + u32 qp_id; + int ret; + + if (!s) + return -EINVAL; + + ret = kstrtou32(s, 0, &qp_id); + if (ret || qp_id >= qm->qp_num) { + dev_err(dev, "Please input qp num (0-%u)", qm->qp_num - 1); + return -EINVAL; + } + + sqc = hisi_qm_ctx_alloc(qm, sizeof(*sqc), &sqc_dma); + if (IS_ERR(sqc)) + return PTR_ERR(sqc); + + ret = hisi_qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 1); + if (ret) { + down_read(&qm->qps_lock); + if (qm->sqc) { + sqc_curr = qm->sqc + qp_id; + + dump_show(qm, sqc_curr, sizeof(*sqc), "SOFT SQC"); + } + up_read(&qm->qps_lock); + + goto free_ctx; + } + + dump_show(qm, sqc, sizeof(*sqc), "SQC"); + +free_ctx: + hisi_qm_ctx_free(qm, sizeof(*sqc), sqc, &sqc_dma); + return 0; +} + +static int qm_cqc_dump(struct hisi_qm *qm, const char *s) +{ + struct device *dev = &qm->pdev->dev; + struct qm_cqc *cqc, *cqc_curr; + dma_addr_t cqc_dma; + u32 qp_id; + int ret; + + if (!s) + return -EINVAL; + + ret = kstrtou32(s, 0, &qp_id); + if (ret || qp_id >= qm->qp_num) { + dev_err(dev, "Please input qp num (0-%u)", qm->qp_num - 1); + return -EINVAL; + } + + cqc = hisi_qm_ctx_alloc(qm, sizeof(*cqc), &cqc_dma); + if (IS_ERR(cqc)) + return PTR_ERR(cqc); + + ret = hisi_qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 1); + if (ret) { + down_read(&qm->qps_lock); + if (qm->cqc) { + cqc_curr = qm->cqc + qp_id; + + dump_show(qm, cqc_curr, sizeof(*cqc), "SOFT CQC"); + } + up_read(&qm->qps_lock); + + goto free_ctx; + } + + dump_show(qm, cqc, sizeof(*cqc), "CQC"); + +free_ctx: + hisi_qm_ctx_free(qm, sizeof(*cqc), cqc, &cqc_dma); + return 0; +} + +static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, size_t size, + int cmd, char *name) +{ + struct device *dev = &qm->pdev->dev; + dma_addr_t xeqc_dma; + void *xeqc; + int ret; + + if (strsep(&s, " ")) { + dev_err(dev, "Please do not input extra characters!\n"); + return -EINVAL; + } + + xeqc = hisi_qm_ctx_alloc(qm, size, &xeqc_dma); + if (IS_ERR(xeqc)) + return PTR_ERR(xeqc); + + ret = hisi_qm_mb(qm, cmd, xeqc_dma, 0, 1); + if (ret) + goto err_free_ctx; + + dump_show(qm, xeqc, size, name); + +err_free_ctx: + hisi_qm_ctx_free(qm, size, xeqc, &xeqc_dma); + return ret; +} + +static int q_dump_param_parse(struct hisi_qm *qm, char *s, + u32 *e_id, u32 *q_id, u16 q_depth) +{ + struct device *dev = &qm->pdev->dev; + unsigned int qp_num = qm->qp_num; + char *presult; + int ret; + + presult = strsep(&s, " "); + if (!presult) { + dev_err(dev, "Please input qp number!\n"); + return -EINVAL; + } + + ret = kstrtou32(presult, 0, q_id); + if (ret || *q_id >= qp_num) { + dev_err(dev, "Please input qp num (0-%u)", qp_num - 1); + return -EINVAL; + } + + presult = strsep(&s, " "); + if (!presult) { + dev_err(dev, "Please input sqe number!\n"); + return -EINVAL; + } + + ret = kstrtou32(presult, 0, e_id); + if (ret || *e_id >= q_depth) { + dev_err(dev, "Please input sqe num (0-%u)", q_depth - 1); + return -EINVAL; + } + + if (strsep(&s, " ")) { + dev_err(dev, "Please do not input extra characters!\n"); + return -EINVAL; + } + + return 0; +} + +static int qm_sq_dump(struct hisi_qm *qm, char *s) +{ + u16 sq_depth = qm->qp_array->cq_depth; + void *sqe, *sqe_curr; + struct hisi_qp *qp; + u32 qp_id, sqe_id; + int ret; + + ret = q_dump_param_parse(qm, s, &sqe_id, &qp_id, sq_depth); + if (ret) + return ret; + + sqe = kzalloc(qm->sqe_size * sq_depth, GFP_KERNEL); + if (!sqe) + return -ENOMEM; + + qp = &qm->qp_array[qp_id]; + memcpy(sqe, qp->sqe, qm->sqe_size * sq_depth); + sqe_curr = sqe + (u32)(sqe_id * qm->sqe_size); + memset(sqe_curr + qm->debug.sqe_mask_offset, QM_SQE_ADDR_MASK, + qm->debug.sqe_mask_len); + + dump_show(qm, sqe_curr, qm->sqe_size, "SQE"); + + kfree(sqe); + + return 0; +} + +static int qm_cq_dump(struct hisi_qm *qm, char *s) +{ + struct qm_cqe *cqe_curr; + struct hisi_qp *qp; + u32 qp_id, cqe_id; + int ret; + + ret = q_dump_param_parse(qm, s, &cqe_id, &qp_id, qm->qp_array->cq_depth); + if (ret) + return ret; + + qp = &qm->qp_array[qp_id]; + cqe_curr = qp->cqe + cqe_id; + dump_show(qm, cqe_curr, sizeof(struct qm_cqe), "CQE"); + + return 0; +} + +static int qm_eq_aeq_dump(struct hisi_qm *qm, const char *s, + size_t size, char *name) +{ + struct device *dev = &qm->pdev->dev; + void *xeqe; + u32 xeqe_id; + int ret; + + if (!s) + return -EINVAL; + + ret = kstrtou32(s, 0, &xeqe_id); + if (ret) + return -EINVAL; + + if (!strcmp(name, "EQE") && xeqe_id >= qm->eq_depth) { + dev_err(dev, "Please input eqe num (0-%u)", qm->eq_depth - 1); + return -EINVAL; + } else if (!strcmp(name, "AEQE") && xeqe_id >= qm->aeq_depth) { + dev_err(dev, "Please input aeqe num (0-%u)", qm->eq_depth - 1); + return -EINVAL; + } + + down_read(&qm->qps_lock); + + if (qm->eqe && !strcmp(name, "EQE")) { + xeqe = qm->eqe + xeqe_id; + } else if (qm->aeqe && !strcmp(name, "AEQE")) { + xeqe = qm->aeqe + xeqe_id; + } else { + ret = -EINVAL; + goto err_unlock; + } + + dump_show(qm, xeqe, size, name); + +err_unlock: + up_read(&qm->qps_lock); + return ret; +} + +static int qm_dbg_help(struct hisi_qm *qm, char *s) +{ + struct device *dev = &qm->pdev->dev; + + if (strsep(&s, " ")) { + dev_err(dev, "Please do not input extra characters!\n"); + return -EINVAL; + } + + dev_info(dev, "available commands:\n"); + dev_info(dev, "sqc \n"); + dev_info(dev, "cqc \n"); + dev_info(dev, "eqc\n"); + dev_info(dev, "aeqc\n"); + dev_info(dev, "sq \n"); + dev_info(dev, "cq \n"); + dev_info(dev, "eq \n"); + dev_info(dev, "aeq \n"); + + return 0; +} + +static int qm_cmd_write_dump(struct hisi_qm *qm, const char *cmd_buf) +{ + struct device *dev = &qm->pdev->dev; + char *presult, *s, *s_tmp; + int ret; + + s = kstrdup(cmd_buf, GFP_KERNEL); + if (!s) + return -ENOMEM; + + s_tmp = s; + presult = strsep(&s, " "); + if (!presult) { + ret = -EINVAL; + goto err_buffer_free; + } + + if (!strcmp(presult, "sqc")) + ret = qm_sqc_dump(qm, s); + else if (!strcmp(presult, "cqc")) + ret = qm_cqc_dump(qm, s); + else if (!strcmp(presult, "eqc")) + ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_eqc), + QM_MB_CMD_EQC, "EQC"); + else if (!strcmp(presult, "aeqc")) + ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_aeqc), + QM_MB_CMD_AEQC, "AEQC"); + else if (!strcmp(presult, "sq")) + ret = qm_sq_dump(qm, s); + else if (!strcmp(presult, "cq")) + ret = qm_cq_dump(qm, s); + else if (!strcmp(presult, "eq")) + ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_eqe), "EQE"); + else if (!strcmp(presult, "aeq")) + ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_aeqe), "AEQE"); + else if (!strcmp(presult, "help")) + ret = qm_dbg_help(qm, s); + else + ret = -EINVAL; + + if (ret) + dev_info(dev, "Please echo help\n"); + +err_buffer_free: + kfree(s_tmp); + + return ret; +} + +static ssize_t qm_cmd_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *pos) +{ + struct hisi_qm *qm = filp->private_data; + char *cmd_buf, *cmd_buf_tmp; + int ret; + + if (*pos) + return 0; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return ret; + + /* Judge if the instance is being reset. */ + if (unlikely(atomic_read(&qm->status.flags) == QM_STOP)) { + ret = 0; + goto put_dfx_access; + } + + if (count > QM_DBG_WRITE_LEN) { + ret = -ENOSPC; + goto put_dfx_access; + } + + cmd_buf = memdup_user_nul(buffer, count); + if (IS_ERR(cmd_buf)) { + ret = PTR_ERR(cmd_buf); + goto put_dfx_access; + } + + cmd_buf_tmp = strchr(cmd_buf, '\n'); + if (cmd_buf_tmp) { + *cmd_buf_tmp = '\0'; + count = cmd_buf_tmp - cmd_buf + 1; + } + + ret = qm_cmd_write_dump(qm, cmd_buf); + if (ret) { + kfree(cmd_buf); + goto put_dfx_access; + } + + kfree(cmd_buf); + + ret = count; + +put_dfx_access: + hisi_qm_put_dfx_access(qm); + return ret; +} + +static const struct file_operations qm_cmd_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qm_cmd_read, + .write = qm_cmd_write, +}; + +/** + * hisi_qm_regs_dump() - Dump registers's value. + * @s: debugfs file handle. + * @regset: accelerator registers information. + * + * Dump accelerator registers. + */ +void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset) +{ + struct pci_dev *pdev = to_pci_dev(regset->dev); + struct hisi_qm *qm = pci_get_drvdata(pdev); + const struct debugfs_reg32 *regs = regset->regs; + int regs_len = regset->nregs; + int i, ret; + u32 val; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return; + + for (i = 0; i < regs_len; i++) { + val = readl(regset->base + regs[i].offset); + seq_printf(s, "%s= 0x%08x\n", regs[i].name, val); + } + + hisi_qm_put_dfx_access(qm); +} +EXPORT_SYMBOL_GPL(hisi_qm_regs_dump); + +static int qm_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + struct debugfs_regset32 regset; + + if (qm->fun_type == QM_HW_PF) { + regset.regs = qm_dfx_regs; + regset.nregs = ARRAY_SIZE(qm_dfx_regs); + } else { + regset.regs = qm_vf_dfx_regs; + regset.nregs = ARRAY_SIZE(qm_vf_dfx_regs); + } + + regset.base = qm->io_base; + regset.dev = &qm->pdev->dev; + + hisi_qm_regs_dump(s, ®set); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(qm_regs); + +static u32 current_q_read(struct hisi_qm *qm) +{ + return readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) >> QM_DFX_QN_SHIFT; +} + +static int current_q_write(struct hisi_qm *qm, u32 val) +{ + u32 tmp; + + if (val >= qm->debug.curr_qm_qp_num) + return -EINVAL; + + tmp = val << QM_DFX_QN_SHIFT | + (readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) & CURRENT_FUN_MASK); + writel(tmp, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); + + tmp = val << QM_DFX_QN_SHIFT | + (readl(qm->io_base + QM_DFX_CQE_CNT_VF_CQN) & CURRENT_FUN_MASK); + writel(tmp, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); + + return 0; +} + +static u32 clear_enable_read(struct hisi_qm *qm) +{ + return readl(qm->io_base + QM_DFX_CNT_CLR_CE); +} + +/* rd_clr_ctrl 1 enable read clear, otherwise 0 disable it */ +static int clear_enable_write(struct hisi_qm *qm, u32 rd_clr_ctrl) +{ + if (rd_clr_ctrl > 1) + return -EINVAL; + + writel(rd_clr_ctrl, qm->io_base + QM_DFX_CNT_CLR_CE); + + return 0; +} + +static u32 current_qm_read(struct hisi_qm *qm) +{ + return readl(qm->io_base + QM_DFX_MB_CNT_VF); +} + +static int qm_get_vf_qp_num(struct hisi_qm *qm, u32 fun_num) +{ + u32 remain_q_num, vfq_num; + u32 num_vfs = qm->vfs_num; + + vfq_num = (qm->ctrl_qp_num - qm->qp_num) / num_vfs; + if (vfq_num >= qm->max_qp_num) + return qm->max_qp_num; + + remain_q_num = (qm->ctrl_qp_num - qm->qp_num) % num_vfs; + if (vfq_num + remain_q_num <= qm->max_qp_num) + return fun_num == num_vfs ? vfq_num + remain_q_num : vfq_num; + + /* + * if vfq_num + remain_q_num > max_qp_num, the last VFs, + * each with one more queue. + */ + return fun_num + remain_q_num > num_vfs ? vfq_num + 1 : vfq_num; +} + +static int current_qm_write(struct hisi_qm *qm, u32 val) +{ + u32 tmp; + + if (val > qm->vfs_num) + return -EINVAL; + + /* According PF or VF Dev ID to calculation curr_qm_qp_num and store */ + if (!val) + qm->debug.curr_qm_qp_num = qm->qp_num; + else + qm->debug.curr_qm_qp_num = qm_get_vf_qp_num(qm, val); + + writel(val, qm->io_base + QM_DFX_MB_CNT_VF); + writel(val, qm->io_base + QM_DFX_DB_CNT_VF); + + tmp = val | + (readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) & CURRENT_Q_MASK); + writel(tmp, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); + + tmp = val | + (readl(qm->io_base + QM_DFX_CQE_CNT_VF_CQN) & CURRENT_Q_MASK); + writel(tmp, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); + + return 0; +} + +static ssize_t qm_debug_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct debugfs_file *file = filp->private_data; + enum qm_debug_file index = file->index; + struct hisi_qm *qm = file_to_qm(file); + char tbuf[QM_DBG_TMP_BUF_LEN]; + u32 val; + int ret; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return ret; + + mutex_lock(&file->lock); + switch (index) { + case CURRENT_QM: + val = current_qm_read(qm); + break; + case CURRENT_Q: + val = current_q_read(qm); + break; + case CLEAR_ENABLE: + val = clear_enable_read(qm); + break; + default: + goto err_input; + } + mutex_unlock(&file->lock); + + hisi_qm_put_dfx_access(qm); + ret = scnprintf(tbuf, QM_DBG_TMP_BUF_LEN, "%u\n", val); + return simple_read_from_buffer(buf, count, pos, tbuf, ret); + +err_input: + mutex_unlock(&file->lock); + hisi_qm_put_dfx_access(qm); + return -EINVAL; +} + +static ssize_t qm_debug_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct debugfs_file *file = filp->private_data; + enum qm_debug_file index = file->index; + struct hisi_qm *qm = file_to_qm(file); + unsigned long val; + char tbuf[QM_DBG_TMP_BUF_LEN]; + int len, ret; + + if (*pos != 0) + return 0; + + if (count >= QM_DBG_TMP_BUF_LEN) + return -ENOSPC; + + len = simple_write_to_buffer(tbuf, QM_DBG_TMP_BUF_LEN - 1, pos, buf, + count); + if (len < 0) + return len; + + tbuf[len] = '\0'; + if (kstrtoul(tbuf, 0, &val)) + return -EFAULT; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return ret; + + mutex_lock(&file->lock); + switch (index) { + case CURRENT_QM: + ret = current_qm_write(qm, val); + break; + case CURRENT_Q: + ret = current_q_write(qm, val); + break; + case CLEAR_ENABLE: + ret = clear_enable_write(qm, val); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&file->lock); + + hisi_qm_put_dfx_access(qm); + + if (ret) + return ret; + + return count; +} + +static const struct file_operations qm_debug_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qm_debug_read, + .write = qm_debug_write, +}; + +static void dfx_regs_uninit(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len) +{ + int i; + + /* Setting the pointer is NULL to prevent double free */ + for (i = 0; i < reg_len; i++) { + kfree(dregs[i].regs); + dregs[i].regs = NULL; + } + kfree(dregs); +} + +static struct dfx_diff_registers *dfx_regs_init(struct hisi_qm *qm, + const struct dfx_diff_registers *cregs, u32 reg_len) +{ + struct dfx_diff_registers *diff_regs; + u32 j, base_offset; + int i; + + diff_regs = kcalloc(reg_len, sizeof(*diff_regs), GFP_KERNEL); + if (!diff_regs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < reg_len; i++) { + if (!cregs[i].reg_len) + continue; + + diff_regs[i].reg_offset = cregs[i].reg_offset; + diff_regs[i].reg_len = cregs[i].reg_len; + diff_regs[i].regs = kcalloc(QM_DFX_REGS_LEN, cregs[i].reg_len, + GFP_KERNEL); + if (!diff_regs[i].regs) + goto alloc_error; + + for (j = 0; j < diff_regs[i].reg_len; j++) { + base_offset = diff_regs[i].reg_offset + + j * QM_DFX_REGS_LEN; + diff_regs[i].regs[j] = readl(qm->io_base + base_offset); + } + } + + return diff_regs; + +alloc_error: + while (i > 0) { + i--; + kfree(diff_regs[i].regs); + } + kfree(diff_regs); + return ERR_PTR(-ENOMEM); +} + +static int qm_diff_regs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len) +{ + qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); + if (IS_ERR(qm->debug.qm_diff_regs)) + return PTR_ERR(qm->debug.qm_diff_regs); + + qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len); + if (IS_ERR(qm->debug.acc_diff_regs)) { + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); + return PTR_ERR(qm->debug.acc_diff_regs); + } + + return 0; +} + +static void qm_last_regs_uninit(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + + if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) + return; + + kfree(debug->qm_last_words); + debug->qm_last_words = NULL; +} + +static int qm_last_regs_init(struct hisi_qm *qm) +{ + int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs); + struct qm_debug *debug = &qm->debug; + int i; + + if (qm->fun_type == QM_HW_VF) + return 0; + + debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int), GFP_KERNEL); + if (!debug->qm_last_words) + return -ENOMEM; + + for (i = 0; i < dfx_regs_num; i++) { + debug->qm_last_words[i] = readl_relaxed(qm->io_base + + qm_dfx_regs[i].offset); + } + + return 0; +} + +static void qm_diff_regs_uninit(struct hisi_qm *qm, u32 reg_len) +{ + dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len); + dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); +} + +/** + * hisi_qm_regs_debugfs_init() - Allocate memory for registers. + * @qm: device qm handle. + * @dregs: diff registers handle. + * @reg_len: diff registers region length. + */ +int hisi_qm_regs_debugfs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, u32 reg_len) +{ + int ret; + + if (!qm || !dregs) + return -EINVAL; + + if (qm->fun_type != QM_HW_PF) + return 0; + + ret = qm_last_regs_init(qm); + if (ret) { + dev_info(&qm->pdev->dev, "failed to init qm words memory!\n"); + return ret; + } + + ret = qm_diff_regs_init(qm, dregs, reg_len); + if (ret) { + qm_last_regs_uninit(qm); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_init); + +/** + * hisi_qm_regs_debugfs_uninit() - Free memory for registers. + * @qm: device qm handle. + * @reg_len: diff registers region length. + */ +void hisi_qm_regs_debugfs_uninit(struct hisi_qm *qm, u32 reg_len) +{ + if (!qm || qm->fun_type != QM_HW_PF) + return; + + qm_diff_regs_uninit(qm, reg_len); + qm_last_regs_uninit(qm); +} +EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_uninit); + +/** + * hisi_qm_acc_diff_regs_dump() - Dump registers's value. + * @qm: device qm handle. + * @s: Debugfs file handle. + * @dregs: diff registers handle. + * @regs_len: diff registers region length. + */ +void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, + struct dfx_diff_registers *dregs, u32 regs_len) +{ + u32 j, val, base_offset; + int i, ret; + + if (!qm || !s || !dregs) + return; + + ret = hisi_qm_get_dfx_access(qm); + if (ret) + return; + + down_read(&qm->qps_lock); + for (i = 0; i < regs_len; i++) { + if (!dregs[i].reg_len) + continue; + + for (j = 0; j < dregs[i].reg_len; j++) { + base_offset = dregs[i].reg_offset + j * QM_DFX_REGS_LEN; + val = readl(qm->io_base + base_offset); + if (val != dregs[i].regs[j]) + seq_printf(s, "0x%08x = 0x%08x ---> 0x%08x\n", + base_offset, dregs[i].regs[j], val); + } + } + up_read(&qm->qps_lock); + + hisi_qm_put_dfx_access(qm); +} +EXPORT_SYMBOL_GPL(hisi_qm_acc_diff_regs_dump); + +void hisi_qm_show_last_dfx_regs(struct hisi_qm *qm) +{ + struct qm_debug *debug = &qm->debug; + struct pci_dev *pdev = qm->pdev; + u32 val; + int i; + + if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) + return; + + for (i = 0; i < ARRAY_SIZE(qm_dfx_regs); i++) { + val = readl_relaxed(qm->io_base + qm_dfx_regs[i].offset); + if (debug->qm_last_words[i] != val) + pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n", + qm_dfx_regs[i].name, debug->qm_last_words[i], val); + } +} + +static int qm_diff_regs_show(struct seq_file *s, void *unused) +{ + struct hisi_qm *qm = s->private; + + hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.qm_diff_regs, + ARRAY_SIZE(qm_diff_regs)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(qm_diff_regs); + +static ssize_t qm_status_read(struct file *filp, char __user *buffer, + size_t count, loff_t *pos) +{ + struct hisi_qm *qm = filp->private_data; + char buf[QM_DBG_READ_LEN]; + int val, len; + + val = atomic_read(&qm->status.flags); + len = scnprintf(buf, QM_DBG_READ_LEN, "%s\n", qm_s[val]); + + return simple_read_from_buffer(buffer, count, pos, buf, len); +} + +static const struct file_operations qm_status_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qm_status_read, +}; + +static void qm_create_debugfs_file(struct hisi_qm *qm, struct dentry *dir, + enum qm_debug_file index) +{ + struct debugfs_file *file = qm->debug.files + index; + + debugfs_create_file(qm_debug_file_name[index], 0600, dir, file, + &qm_debug_fops); + + file->index = index; + mutex_init(&file->lock); + file->debug = &qm->debug; +} + +static int qm_debugfs_atomic64_set(void *data, u64 val) +{ + if (val) + return -EINVAL; + + atomic64_set((atomic64_t *)data, 0); + + return 0; +} + +static int qm_debugfs_atomic64_get(void *data, u64 *val) +{ + *val = atomic64_read((atomic64_t *)data); + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(qm_atomic64_ops, qm_debugfs_atomic64_get, + qm_debugfs_atomic64_set, "%llu\n"); + +/** + * hisi_qm_debug_init() - Initialize qm related debugfs files. + * @qm: The qm for which we want to add debugfs files. + * + * Create qm related debugfs files. + */ +void hisi_qm_debug_init(struct hisi_qm *qm) +{ + struct dfx_diff_registers *qm_regs = qm->debug.qm_diff_regs; + struct qm_dfx *dfx = &qm->debug.dfx; + struct dentry *qm_d; + void *data; + int i; + + qm_d = debugfs_create_dir("qm", qm->debug.debug_root); + qm->debug.qm_d = qm_d; + + /* only show this in PF */ + if (qm->fun_type == QM_HW_PF) { + qm_create_debugfs_file(qm, qm->debug.debug_root, CURRENT_QM); + for (i = CURRENT_Q; i < DEBUG_FILE_NUM; i++) + qm_create_debugfs_file(qm, qm->debug.qm_d, i); + } + + if (qm_regs) + debugfs_create_file("diff_regs", 0444, qm->debug.qm_d, + qm, &qm_diff_regs_fops); + + debugfs_create_file("regs", 0444, qm->debug.qm_d, qm, &qm_regs_fops); + + debugfs_create_file("cmd", 0600, qm->debug.qm_d, qm, &qm_cmd_fops); + + debugfs_create_file("status", 0444, qm->debug.qm_d, qm, + &qm_status_fops); + for (i = 0; i < ARRAY_SIZE(qm_dfx_files); i++) { + data = (atomic64_t *)((uintptr_t)dfx + qm_dfx_files[i].offset); + debugfs_create_file(qm_dfx_files[i].name, + 0644, + qm_d, + data, + &qm_atomic64_ops); + } + + if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) + hisi_qm_set_algqos_init(qm); +} +EXPORT_SYMBOL_GPL(hisi_qm_debug_init); + +/** + * hisi_qm_debug_regs_clear() - clear qm debug related registers. + * @qm: The qm for which we want to clear its debug registers. + */ +void hisi_qm_debug_regs_clear(struct hisi_qm *qm) +{ + const struct debugfs_reg32 *regs; + int i; + + /* clear current_qm */ + writel(0x0, qm->io_base + QM_DFX_MB_CNT_VF); + writel(0x0, qm->io_base + QM_DFX_DB_CNT_VF); + + /* clear current_q */ + writel(0x0, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); + writel(0x0, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); + + /* + * these registers are reading and clearing, so clear them after + * reading them. + */ + writel(0x1, qm->io_base + QM_DFX_CNT_CLR_CE); + + regs = qm_dfx_regs; + for (i = 0; i < CNT_CYC_REGS_NUM; i++) { + readl(qm->io_base + regs->offset); + regs++; + } + + /* clear clear_enable */ + writel(0x0, qm->io_base + QM_DFX_CNT_CLR_CE); +} +EXPORT_SYMBOL_GPL(hisi_qm_debug_regs_clear); diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 441466df7c6d5..36d70b9f6117c 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -16,6 +16,7 @@ #include #include #include +#include "qm_common.h" /* eq/aeq irq enable */ #define QM_VF_AEQ_INT_SOURCE 0x0 @@ -119,8 +120,6 @@ #define QM_SQC_VFT_NUM_SHIFT_V2 45 #define QM_SQC_VFT_NUM_MASK_v2 GENMASK(9, 0) -#define QM_DFX_CNT_CLR_CE 0x100118 - #define QM_ABNORMAL_INT_SOURCE 0x100000 #define QM_ABNORMAL_INT_MASK 0x100004 #define QM_ABNORMAL_INT_MASK_VALUE 0x7fff @@ -187,14 +186,6 @@ #define QM_VF_RESET_WAIT_TIMEOUT_US \ (QM_VF_RESET_WAIT_US * QM_VF_RESET_WAIT_CNT) -#define QM_DFX_MB_CNT_VF 0x104010 -#define QM_DFX_DB_CNT_VF 0x104020 -#define QM_DFX_SQE_CNT_VF_SQN 0x104030 -#define QM_DFX_CQE_CNT_VF_CQN 0x104040 -#define QM_DFX_QN_SHIFT 16 -#define CURRENT_FUN_MASK GENMASK(5, 0) -#define CURRENT_Q_MASK GENMASK(31, 16) - #define POLL_PERIOD 10 #define POLL_TIMEOUT 1000 #define WAIT_PERIOD_US_MAX 200 @@ -211,19 +202,15 @@ #define QMC_ALIGN(sz) ALIGN(sz, 32) #define QM_DBG_READ_LEN 256 -#define QM_DBG_WRITE_LEN 1024 -#define QM_DBG_TMP_BUF_LEN 22 #define QM_PCI_COMMAND_INVALID ~0 #define QM_RESET_STOP_TX_OFFSET 1 #define QM_RESET_STOP_RX_OFFSET 2 #define WAIT_PERIOD 20 #define REMOVE_WAIT_DELAY 10 -#define QM_SQE_ADDR_MASK GENMASK(7, 0) #define QM_DRIVER_REMOVING 0 #define QM_RST_SCHED 1 -#define QM_RESETTING 2 #define QM_QOS_PARAM_NUM 2 #define QM_QOS_VAL_NUM 1 #define QM_QOS_BDF_PARAM_NUM 4 @@ -250,15 +237,6 @@ #define QM_QOS_MIN_CIR_B 100 #define QM_QOS_MAX_CIR_U 6 #define QM_QOS_MAX_CIR_S 11 -#define QM_DFX_BASE 0x0100000 -#define QM_DFX_STATE1 0x0104000 -#define QM_DFX_STATE2 0x01040C8 -#define QM_DFX_COMMON 0x0000 -#define QM_DFX_BASE_LEN 0x5A -#define QM_DFX_STATE1_LEN 0x2E -#define QM_DFX_STATE2_LEN 0x11 -#define QM_DFX_COMMON_LEN 0xC3 -#define QM_DFX_REGS_LEN 4UL #define QM_AUTOSUSPEND_DELAY 3000 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \ @@ -368,73 +346,6 @@ static const struct hisi_qm_cap_info qm_basic_info[] = { {QM_VF_IRQ_NUM_CAP, 0x311c, 0, GENMASK(15, 0), 0x1, 0x2, 0x3}, }; -struct qm_cqe { - __le32 rsvd0; - __le16 cmd_id; - __le16 rsvd1; - __le16 sq_head; - __le16 sq_num; - __le16 rsvd2; - __le16 w7; -}; - -struct qm_eqe { - __le32 dw0; -}; - -struct qm_aeqe { - __le32 dw0; -}; - -struct qm_sqc { - __le16 head; - __le16 tail; - __le32 base_l; - __le32 base_h; - __le32 dw3; - __le16 w8; - __le16 rsvd0; - __le16 pasid; - __le16 w11; - __le16 cq_num; - __le16 w13; - __le32 rsvd1; -}; - -struct qm_cqc { - __le16 head; - __le16 tail; - __le32 base_l; - __le32 base_h; - __le32 dw3; - __le16 w8; - __le16 rsvd0; - __le16 pasid; - __le16 w11; - __le32 dw6; - __le32 rsvd1; -}; - -struct qm_eqc { - __le16 head; - __le16 tail; - __le32 base_l; - __le32 base_h; - __le32 dw3; - __le32 rsvd[2]; - __le32 dw6; -}; - -struct qm_aeqc { - __le16 head; - __le16 tail; - __le32 base_l; - __le32 base_h; - __le32 dw3; - __le32 rsvd[2]; - __le32 dw6; -}; - struct qm_mailbox { __le16 w0; __le16 queue_num; @@ -467,25 +378,6 @@ struct hisi_qm_hw_ops { int (*set_msi)(struct hisi_qm *qm, bool set); }; -struct qm_dfx_item { - const char *name; - u32 offset; -}; - -static struct qm_dfx_item qm_dfx_files[] = { - {"err_irq", offsetof(struct qm_dfx, err_irq_cnt)}, - {"aeq_irq", offsetof(struct qm_dfx, aeq_irq_cnt)}, - {"abnormal_irq", offsetof(struct qm_dfx, abnormal_irq_cnt)}, - {"create_qp_err", offsetof(struct qm_dfx, create_qp_err_cnt)}, - {"mb_err", offsetof(struct qm_dfx, mb_err_cnt)}, -}; - -static const char * const qm_debug_file_name[] = { - [CURRENT_QM] = "current_qm", - [CURRENT_Q] = "current_q", - [CLEAR_ENABLE] = "clear_enable", -}; - struct hisi_qm_hw_error { u32 int_msk; const char *msg; @@ -510,23 +402,6 @@ static const struct hisi_qm_hw_error qm_hw_error[] = { { /* sentinel */ } }; -/* define the QM's dfx regs region and region length */ -static struct dfx_diff_registers qm_diff_regs[] = { - { - .reg_offset = QM_DFX_BASE, - .reg_len = QM_DFX_BASE_LEN, - }, { - .reg_offset = QM_DFX_STATE1, - .reg_len = QM_DFX_STATE1_LEN, - }, { - .reg_offset = QM_DFX_STATE2, - .reg_len = QM_DFX_STATE2_LEN, - }, { - .reg_offset = QM_DFX_COMMON, - .reg_len = QM_DFX_COMMON_LEN, - }, -}; - static const char * const qm_db_timeout[] = { "sq", "cq", "eq", "aeq", }; @@ -535,10 +410,6 @@ static const char * const qm_fifo_overflow[] = { "cq", "eq", "aeq", }; -static const char * const qm_s[] = { - "init", "start", "close", "stop", -}; - static const char * const qp_s[] = { "none", "init", "start", "stop", "close", }; @@ -1332,1046 +1203,150 @@ static void qm_vft_data_cfg(struct hisi_qm *qm, enum vft_type type, u32 base, (QM_SHAPER_CBS_B << QM_SHAPER_FACTOR_CBS_B_SHIFT) | (factor->cbs_s << QM_SHAPER_FACTOR_CBS_S_SHIFT); } - break; - } - } - - writel(lower_32_bits(tmp), qm->io_base + QM_VFT_CFG_DATA_L); - writel(upper_32_bits(tmp), qm->io_base + QM_VFT_CFG_DATA_H); -} - -static int qm_set_vft_common(struct hisi_qm *qm, enum vft_type type, - u32 fun_num, u32 base, u32 number) -{ - struct qm_shaper_factor *factor = NULL; - unsigned int val; - int ret; - - if (type == SHAPER_VFT && test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) - factor = &qm->factor[fun_num]; - - ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, - val & BIT(0), POLL_PERIOD, - POLL_TIMEOUT); - if (ret) - return ret; - - writel(0x0, qm->io_base + QM_VFT_CFG_OP_WR); - writel(type, qm->io_base + QM_VFT_CFG_TYPE); - if (type == SHAPER_VFT) - fun_num |= base << QM_SHAPER_VFT_OFFSET; - - writel(fun_num, qm->io_base + QM_VFT_CFG); - - qm_vft_data_cfg(qm, type, base, number, factor); - - writel(0x0, qm->io_base + QM_VFT_CFG_RDY); - writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE); - - return readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, - val & BIT(0), POLL_PERIOD, - POLL_TIMEOUT); -} - -static int qm_shaper_init_vft(struct hisi_qm *qm, u32 fun_num) -{ - u32 qos = qm->factor[fun_num].func_qos; - int ret, i; - - ret = qm_get_shaper_para(qos * QM_QOS_RATE, &qm->factor[fun_num]); - if (ret) { - dev_err(&qm->pdev->dev, "failed to calculate shaper parameter!\n"); - return ret; - } - writel(qm->type_rate, qm->io_base + QM_SHAPER_CFG); - for (i = ALG_TYPE_0; i <= ALG_TYPE_1; i++) { - /* The base number of queue reuse for different alg type */ - ret = qm_set_vft_common(qm, SHAPER_VFT, fun_num, i, 1); - if (ret) - return ret; - } - - return 0; -} - -/* The config should be conducted after qm_dev_mem_reset() */ -static int qm_set_sqc_cqc_vft(struct hisi_qm *qm, u32 fun_num, u32 base, - u32 number) -{ - int ret, i; - - for (i = SQC_VFT; i <= CQC_VFT; i++) { - ret = qm_set_vft_common(qm, i, fun_num, base, number); - if (ret) - return ret; - } - - /* init default shaper qos val */ - if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) { - ret = qm_shaper_init_vft(qm, fun_num); - if (ret) - goto back_sqc_cqc; - } - - return 0; -back_sqc_cqc: - for (i = SQC_VFT; i <= CQC_VFT; i++) - qm_set_vft_common(qm, i, fun_num, 0, 0); - - return ret; -} - -static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number) -{ - u64 sqc_vft; - int ret; - - ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); - if (ret) - return ret; - - sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | - ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << 32); - *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); - *number = (QM_SQC_VFT_NUM_MASK_v2 & - (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; - - return 0; -} - -static int qm_get_vf_qp_num(struct hisi_qm *qm, u32 fun_num) -{ - u32 remain_q_num, vfq_num; - u32 num_vfs = qm->vfs_num; - - vfq_num = (qm->ctrl_qp_num - qm->qp_num) / num_vfs; - if (vfq_num >= qm->max_qp_num) - return qm->max_qp_num; - - remain_q_num = (qm->ctrl_qp_num - qm->qp_num) % num_vfs; - if (vfq_num + remain_q_num <= qm->max_qp_num) - return fun_num == num_vfs ? vfq_num + remain_q_num : vfq_num; - - /* - * if vfq_num + remain_q_num > max_qp_num, the last VFs, - * each with one more queue. - */ - return fun_num + remain_q_num > num_vfs ? vfq_num + 1 : vfq_num; -} - -static struct hisi_qm *file_to_qm(struct debugfs_file *file) -{ - struct qm_debug *debug = file->debug; - - return container_of(debug, struct hisi_qm, debug); -} - -static u32 current_q_read(struct hisi_qm *qm) -{ - return readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) >> QM_DFX_QN_SHIFT; -} - -static int current_q_write(struct hisi_qm *qm, u32 val) -{ - u32 tmp; - - if (val >= qm->debug.curr_qm_qp_num) - return -EINVAL; - - tmp = val << QM_DFX_QN_SHIFT | - (readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) & CURRENT_FUN_MASK); - writel(tmp, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); - - tmp = val << QM_DFX_QN_SHIFT | - (readl(qm->io_base + QM_DFX_CQE_CNT_VF_CQN) & CURRENT_FUN_MASK); - writel(tmp, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); - - return 0; -} - -static u32 clear_enable_read(struct hisi_qm *qm) -{ - return readl(qm->io_base + QM_DFX_CNT_CLR_CE); -} - -/* rd_clr_ctrl 1 enable read clear, otherwise 0 disable it */ -static int clear_enable_write(struct hisi_qm *qm, u32 rd_clr_ctrl) -{ - if (rd_clr_ctrl > 1) - return -EINVAL; - - writel(rd_clr_ctrl, qm->io_base + QM_DFX_CNT_CLR_CE); - - return 0; -} - -static u32 current_qm_read(struct hisi_qm *qm) -{ - return readl(qm->io_base + QM_DFX_MB_CNT_VF); -} - -static int current_qm_write(struct hisi_qm *qm, u32 val) -{ - u32 tmp; - - if (val > qm->vfs_num) - return -EINVAL; - - /* According PF or VF Dev ID to calculation curr_qm_qp_num and store */ - if (!val) - qm->debug.curr_qm_qp_num = qm->qp_num; - else - qm->debug.curr_qm_qp_num = qm_get_vf_qp_num(qm, val); - - writel(val, qm->io_base + QM_DFX_MB_CNT_VF); - writel(val, qm->io_base + QM_DFX_DB_CNT_VF); - - tmp = val | - (readl(qm->io_base + QM_DFX_SQE_CNT_VF_SQN) & CURRENT_Q_MASK); - writel(tmp, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); - - tmp = val | - (readl(qm->io_base + QM_DFX_CQE_CNT_VF_CQN) & CURRENT_Q_MASK); - writel(tmp, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); - - return 0; -} - -static ssize_t qm_debug_read(struct file *filp, char __user *buf, - size_t count, loff_t *pos) -{ - struct debugfs_file *file = filp->private_data; - enum qm_debug_file index = file->index; - struct hisi_qm *qm = file_to_qm(file); - char tbuf[QM_DBG_TMP_BUF_LEN]; - u32 val; - int ret; - - ret = hisi_qm_get_dfx_access(qm); - if (ret) - return ret; - - mutex_lock(&file->lock); - switch (index) { - case CURRENT_QM: - val = current_qm_read(qm); - break; - case CURRENT_Q: - val = current_q_read(qm); - break; - case CLEAR_ENABLE: - val = clear_enable_read(qm); - break; - default: - goto err_input; - } - mutex_unlock(&file->lock); - - hisi_qm_put_dfx_access(qm); - ret = scnprintf(tbuf, QM_DBG_TMP_BUF_LEN, "%u\n", val); - return simple_read_from_buffer(buf, count, pos, tbuf, ret); - -err_input: - mutex_unlock(&file->lock); - hisi_qm_put_dfx_access(qm); - return -EINVAL; -} - -static ssize_t qm_debug_write(struct file *filp, const char __user *buf, - size_t count, loff_t *pos) -{ - struct debugfs_file *file = filp->private_data; - enum qm_debug_file index = file->index; - struct hisi_qm *qm = file_to_qm(file); - unsigned long val; - char tbuf[QM_DBG_TMP_BUF_LEN]; - int len, ret; - - if (*pos != 0) - return 0; - - if (count >= QM_DBG_TMP_BUF_LEN) - return -ENOSPC; - - len = simple_write_to_buffer(tbuf, QM_DBG_TMP_BUF_LEN - 1, pos, buf, - count); - if (len < 0) - return len; - - tbuf[len] = '\0'; - if (kstrtoul(tbuf, 0, &val)) - return -EFAULT; - - ret = hisi_qm_get_dfx_access(qm); - if (ret) - return ret; - - mutex_lock(&file->lock); - switch (index) { - case CURRENT_QM: - ret = current_qm_write(qm, val); - break; - case CURRENT_Q: - ret = current_q_write(qm, val); - break; - case CLEAR_ENABLE: - ret = clear_enable_write(qm, val); - break; - default: - ret = -EINVAL; - } - mutex_unlock(&file->lock); - - hisi_qm_put_dfx_access(qm); - - if (ret) - return ret; - - return count; -} - -static const struct file_operations qm_debug_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = qm_debug_read, - .write = qm_debug_write, -}; - -#define CNT_CYC_REGS_NUM 10 -static const struct debugfs_reg32 qm_dfx_regs[] = { - /* XXX_CNT are reading clear register */ - {"QM_ECC_1BIT_CNT ", 0x104000ull}, - {"QM_ECC_MBIT_CNT ", 0x104008ull}, - {"QM_DFX_MB_CNT ", 0x104018ull}, - {"QM_DFX_DB_CNT ", 0x104028ull}, - {"QM_DFX_SQE_CNT ", 0x104038ull}, - {"QM_DFX_CQE_CNT ", 0x104048ull}, - {"QM_DFX_SEND_SQE_TO_ACC_CNT ", 0x104050ull}, - {"QM_DFX_WB_SQE_FROM_ACC_CNT ", 0x104058ull}, - {"QM_DFX_ACC_FINISH_CNT ", 0x104060ull}, - {"QM_DFX_CQE_ERR_CNT ", 0x1040b4ull}, - {"QM_DFX_FUNS_ACTIVE_ST ", 0x200ull}, - {"QM_ECC_1BIT_INF ", 0x104004ull}, - {"QM_ECC_MBIT_INF ", 0x10400cull}, - {"QM_DFX_ACC_RDY_VLD0 ", 0x1040a0ull}, - {"QM_DFX_ACC_RDY_VLD1 ", 0x1040a4ull}, - {"QM_DFX_AXI_RDY_VLD ", 0x1040a8ull}, - {"QM_DFX_FF_ST0 ", 0x1040c8ull}, - {"QM_DFX_FF_ST1 ", 0x1040ccull}, - {"QM_DFX_FF_ST2 ", 0x1040d0ull}, - {"QM_DFX_FF_ST3 ", 0x1040d4ull}, - {"QM_DFX_FF_ST4 ", 0x1040d8ull}, - {"QM_DFX_FF_ST5 ", 0x1040dcull}, - {"QM_DFX_FF_ST6 ", 0x1040e0ull}, - {"QM_IN_IDLE_ST ", 0x1040e4ull}, -}; - -static const struct debugfs_reg32 qm_vf_dfx_regs[] = { - {"QM_DFX_FUNS_ACTIVE_ST ", 0x200ull}, -}; - -/** - * hisi_qm_regs_dump() - Dump registers's value. - * @s: debugfs file handle. - * @regset: accelerator registers information. - * - * Dump accelerator registers. - */ -void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset) -{ - struct pci_dev *pdev = to_pci_dev(regset->dev); - struct hisi_qm *qm = pci_get_drvdata(pdev); - const struct debugfs_reg32 *regs = regset->regs; - int regs_len = regset->nregs; - int i, ret; - u32 val; - - ret = hisi_qm_get_dfx_access(qm); - if (ret) - return; - - for (i = 0; i < regs_len; i++) { - val = readl(regset->base + regs[i].offset); - seq_printf(s, "%s= 0x%08x\n", regs[i].name, val); - } - - hisi_qm_put_dfx_access(qm); -} -EXPORT_SYMBOL_GPL(hisi_qm_regs_dump); - -static int qm_regs_show(struct seq_file *s, void *unused) -{ - struct hisi_qm *qm = s->private; - struct debugfs_regset32 regset; - - if (qm->fun_type == QM_HW_PF) { - regset.regs = qm_dfx_regs; - regset.nregs = ARRAY_SIZE(qm_dfx_regs); - } else { - regset.regs = qm_vf_dfx_regs; - regset.nregs = ARRAY_SIZE(qm_vf_dfx_regs); - } - - regset.base = qm->io_base; - regset.dev = &qm->pdev->dev; - - hisi_qm_regs_dump(s, ®set); - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(qm_regs); - -static void dfx_regs_uninit(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, int reg_len) -{ - int i; - - /* Setting the pointer is NULL to prevent double free */ - for (i = 0; i < reg_len; i++) { - kfree(dregs[i].regs); - dregs[i].regs = NULL; - } - kfree(dregs); -} - -static struct dfx_diff_registers *dfx_regs_init(struct hisi_qm *qm, - const struct dfx_diff_registers *cregs, u32 reg_len) -{ - struct dfx_diff_registers *diff_regs; - u32 j, base_offset; - int i; - - diff_regs = kcalloc(reg_len, sizeof(*diff_regs), GFP_KERNEL); - if (!diff_regs) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < reg_len; i++) { - if (!cregs[i].reg_len) - continue; - - diff_regs[i].reg_offset = cregs[i].reg_offset; - diff_regs[i].reg_len = cregs[i].reg_len; - diff_regs[i].regs = kcalloc(QM_DFX_REGS_LEN, cregs[i].reg_len, - GFP_KERNEL); - if (!diff_regs[i].regs) - goto alloc_error; - - for (j = 0; j < diff_regs[i].reg_len; j++) { - base_offset = diff_regs[i].reg_offset + - j * QM_DFX_REGS_LEN; - diff_regs[i].regs[j] = readl(qm->io_base + base_offset); - } - } - - return diff_regs; - -alloc_error: - while (i > 0) { - i--; - kfree(diff_regs[i].regs); - } - kfree(diff_regs); - return ERR_PTR(-ENOMEM); -} - -static int qm_diff_regs_init(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, u32 reg_len) -{ - qm->debug.qm_diff_regs = dfx_regs_init(qm, qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); - if (IS_ERR(qm->debug.qm_diff_regs)) - return PTR_ERR(qm->debug.qm_diff_regs); - - qm->debug.acc_diff_regs = dfx_regs_init(qm, dregs, reg_len); - if (IS_ERR(qm->debug.acc_diff_regs)) { - dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); - return PTR_ERR(qm->debug.acc_diff_regs); - } - - return 0; -} - -static void qm_last_regs_uninit(struct hisi_qm *qm) -{ - struct qm_debug *debug = &qm->debug; - - if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) - return; - - kfree(debug->qm_last_words); - debug->qm_last_words = NULL; -} - -static int qm_last_regs_init(struct hisi_qm *qm) -{ - int dfx_regs_num = ARRAY_SIZE(qm_dfx_regs); - struct qm_debug *debug = &qm->debug; - int i; - - if (qm->fun_type == QM_HW_VF) - return 0; - - debug->qm_last_words = kcalloc(dfx_regs_num, sizeof(unsigned int), GFP_KERNEL); - if (!debug->qm_last_words) - return -ENOMEM; - - for (i = 0; i < dfx_regs_num; i++) { - debug->qm_last_words[i] = readl_relaxed(qm->io_base + - qm_dfx_regs[i].offset); - } - - return 0; -} - -static void qm_diff_regs_uninit(struct hisi_qm *qm, u32 reg_len) -{ - dfx_regs_uninit(qm, qm->debug.acc_diff_regs, reg_len); - dfx_regs_uninit(qm, qm->debug.qm_diff_regs, ARRAY_SIZE(qm_diff_regs)); -} - -/** - * hisi_qm_regs_debugfs_init() - Allocate memory for registers. - * @qm: device qm handle. - * @dregs: diff registers handle. - * @reg_len: diff registers region length. - */ -int hisi_qm_regs_debugfs_init(struct hisi_qm *qm, - struct dfx_diff_registers *dregs, u32 reg_len) -{ - int ret; - - if (!qm || !dregs) - return -EINVAL; - - if (qm->fun_type != QM_HW_PF) - return 0; - - ret = qm_last_regs_init(qm); - if (ret) { - dev_info(&qm->pdev->dev, "failed to init qm words memory!\n"); - return ret; - } - - ret = qm_diff_regs_init(qm, dregs, reg_len); - if (ret) { - qm_last_regs_uninit(qm); - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_init); - -/** - * hisi_qm_regs_debugfs_uninit() - Free memory for registers. - * @qm: device qm handle. - * @reg_len: diff registers region length. - */ -void hisi_qm_regs_debugfs_uninit(struct hisi_qm *qm, u32 reg_len) -{ - if (!qm || qm->fun_type != QM_HW_PF) - return; - - qm_diff_regs_uninit(qm, reg_len); - qm_last_regs_uninit(qm); -} -EXPORT_SYMBOL_GPL(hisi_qm_regs_debugfs_uninit); - -/** - * hisi_qm_acc_diff_regs_dump() - Dump registers's value. - * @qm: device qm handle. - * @s: Debugfs file handle. - * @dregs: diff registers handle. - * @regs_len: diff registers region length. - */ -void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, - struct dfx_diff_registers *dregs, u32 regs_len) -{ - u32 j, val, base_offset; - int i, ret; - - if (!qm || !s || !dregs) - return; - - ret = hisi_qm_get_dfx_access(qm); - if (ret) - return; - - down_read(&qm->qps_lock); - for (i = 0; i < regs_len; i++) { - if (!dregs[i].reg_len) - continue; - - for (j = 0; j < dregs[i].reg_len; j++) { - base_offset = dregs[i].reg_offset + j * QM_DFX_REGS_LEN; - val = readl(qm->io_base + base_offset); - if (val != dregs[i].regs[j]) - seq_printf(s, "0x%08x = 0x%08x ---> 0x%08x\n", - base_offset, dregs[i].regs[j], val); - } - } - up_read(&qm->qps_lock); - - hisi_qm_put_dfx_access(qm); -} -EXPORT_SYMBOL_GPL(hisi_qm_acc_diff_regs_dump); - -static int qm_diff_regs_show(struct seq_file *s, void *unused) -{ - struct hisi_qm *qm = s->private; - - hisi_qm_acc_diff_regs_dump(qm, s, qm->debug.qm_diff_regs, - ARRAY_SIZE(qm_diff_regs)); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(qm_diff_regs); - -static ssize_t qm_cmd_read(struct file *filp, char __user *buffer, - size_t count, loff_t *pos) -{ - char buf[QM_DBG_READ_LEN]; - int len; - - len = scnprintf(buf, QM_DBG_READ_LEN, "%s\n", - "Please echo help to cmd to get help information"); - - return simple_read_from_buffer(buffer, count, pos, buf, len); -} - -static void *qm_ctx_alloc(struct hisi_qm *qm, size_t ctx_size, - dma_addr_t *dma_addr) -{ - struct device *dev = &qm->pdev->dev; - void *ctx_addr; - - ctx_addr = kzalloc(ctx_size, GFP_KERNEL); - if (!ctx_addr) - return ERR_PTR(-ENOMEM); - - *dma_addr = dma_map_single(dev, ctx_addr, ctx_size, DMA_FROM_DEVICE); - if (dma_mapping_error(dev, *dma_addr)) { - dev_err(dev, "DMA mapping error!\n"); - kfree(ctx_addr); - return ERR_PTR(-ENOMEM); - } - - return ctx_addr; -} - -static void qm_ctx_free(struct hisi_qm *qm, size_t ctx_size, - const void *ctx_addr, dma_addr_t *dma_addr) -{ - struct device *dev = &qm->pdev->dev; - - dma_unmap_single(dev, *dma_addr, ctx_size, DMA_FROM_DEVICE); - kfree(ctx_addr); -} - -static void dump_show(struct hisi_qm *qm, void *info, - unsigned int info_size, char *info_name) -{ - struct device *dev = &qm->pdev->dev; - u8 *info_curr = info; - u32 i; -#define BYTE_PER_DW 4 - - dev_info(dev, "%s DUMP\n", info_name); - for (i = 0; i < info_size; i += BYTE_PER_DW, info_curr += BYTE_PER_DW) { - pr_info("DW%u: %02X%02X %02X%02X\n", i / BYTE_PER_DW, - *(info_curr + 3), *(info_curr + 2), *(info_curr + 1), *(info_curr)); - } -} - -static int qm_dump_sqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) -{ - return hisi_qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1); -} - -static int qm_dump_cqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) -{ - return hisi_qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1); -} - -static int qm_sqc_dump(struct hisi_qm *qm, const char *s) -{ - struct device *dev = &qm->pdev->dev; - struct qm_sqc *sqc, *sqc_curr; - dma_addr_t sqc_dma; - u32 qp_id; - int ret; - - if (!s) - return -EINVAL; - - ret = kstrtou32(s, 0, &qp_id); - if (ret || qp_id >= qm->qp_num) { - dev_err(dev, "Please input qp num (0-%u)", qm->qp_num - 1); - return -EINVAL; - } - - sqc = qm_ctx_alloc(qm, sizeof(*sqc), &sqc_dma); - if (IS_ERR(sqc)) - return PTR_ERR(sqc); - - ret = qm_dump_sqc_raw(qm, sqc_dma, qp_id); - if (ret) { - down_read(&qm->qps_lock); - if (qm->sqc) { - sqc_curr = qm->sqc + qp_id; - - dump_show(qm, sqc_curr, sizeof(*sqc), "SOFT SQC"); - } - up_read(&qm->qps_lock); - - goto free_ctx; - } - - dump_show(qm, sqc, sizeof(*sqc), "SQC"); - -free_ctx: - qm_ctx_free(qm, sizeof(*sqc), sqc, &sqc_dma); - return 0; -} - -static int qm_cqc_dump(struct hisi_qm *qm, const char *s) -{ - struct device *dev = &qm->pdev->dev; - struct qm_cqc *cqc, *cqc_curr; - dma_addr_t cqc_dma; - u32 qp_id; - int ret; - - if (!s) - return -EINVAL; - - ret = kstrtou32(s, 0, &qp_id); - if (ret || qp_id >= qm->qp_num) { - dev_err(dev, "Please input qp num (0-%u)", qm->qp_num - 1); - return -EINVAL; - } - - cqc = qm_ctx_alloc(qm, sizeof(*cqc), &cqc_dma); - if (IS_ERR(cqc)) - return PTR_ERR(cqc); - - ret = qm_dump_cqc_raw(qm, cqc_dma, qp_id); - if (ret) { - down_read(&qm->qps_lock); - if (qm->cqc) { - cqc_curr = qm->cqc + qp_id; - - dump_show(qm, cqc_curr, sizeof(*cqc), "SOFT CQC"); + break; } - up_read(&qm->qps_lock); - - goto free_ctx; } - dump_show(qm, cqc, sizeof(*cqc), "CQC"); - -free_ctx: - qm_ctx_free(qm, sizeof(*cqc), cqc, &cqc_dma); - return 0; + writel(lower_32_bits(tmp), qm->io_base + QM_VFT_CFG_DATA_L); + writel(upper_32_bits(tmp), qm->io_base + QM_VFT_CFG_DATA_H); } -static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, size_t size, - int cmd, char *name) +static int qm_set_vft_common(struct hisi_qm *qm, enum vft_type type, + u32 fun_num, u32 base, u32 number) { - struct device *dev = &qm->pdev->dev; - dma_addr_t xeqc_dma; - void *xeqc; + struct qm_shaper_factor *factor = NULL; + unsigned int val; int ret; - if (strsep(&s, " ")) { - dev_err(dev, "Please do not input extra characters!\n"); - return -EINVAL; - } - - xeqc = qm_ctx_alloc(qm, size, &xeqc_dma); - if (IS_ERR(xeqc)) - return PTR_ERR(xeqc); + if (type == SHAPER_VFT && test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) + factor = &qm->factor[fun_num]; - ret = hisi_qm_mb(qm, cmd, xeqc_dma, 0, 1); + ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); if (ret) - goto err_free_ctx; + return ret; - dump_show(qm, xeqc, size, name); + writel(0x0, qm->io_base + QM_VFT_CFG_OP_WR); + writel(type, qm->io_base + QM_VFT_CFG_TYPE); + if (type == SHAPER_VFT) + fun_num |= base << QM_SHAPER_VFT_OFFSET; -err_free_ctx: - qm_ctx_free(qm, size, xeqc, &xeqc_dma); - return ret; -} + writel(fun_num, qm->io_base + QM_VFT_CFG); -static int q_dump_param_parse(struct hisi_qm *qm, char *s, - u32 *e_id, u32 *q_id, u16 q_depth) -{ - struct device *dev = &qm->pdev->dev; - unsigned int qp_num = qm->qp_num; - char *presult; - int ret; + qm_vft_data_cfg(qm, type, base, number, factor); - presult = strsep(&s, " "); - if (!presult) { - dev_err(dev, "Please input qp number!\n"); - return -EINVAL; - } + writel(0x0, qm->io_base + QM_VFT_CFG_RDY); + writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE); - ret = kstrtou32(presult, 0, q_id); - if (ret || *q_id >= qp_num) { - dev_err(dev, "Please input qp num (0-%u)", qp_num - 1); - return -EINVAL; - } + return readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); +} - presult = strsep(&s, " "); - if (!presult) { - dev_err(dev, "Please input sqe number!\n"); - return -EINVAL; - } +static int qm_shaper_init_vft(struct hisi_qm *qm, u32 fun_num) +{ + u32 qos = qm->factor[fun_num].func_qos; + int ret, i; - ret = kstrtou32(presult, 0, e_id); - if (ret || *e_id >= q_depth) { - dev_err(dev, "Please input sqe num (0-%u)", q_depth - 1); - return -EINVAL; + ret = qm_get_shaper_para(qos * QM_QOS_RATE, &qm->factor[fun_num]); + if (ret) { + dev_err(&qm->pdev->dev, "failed to calculate shaper parameter!\n"); + return ret; } - - if (strsep(&s, " ")) { - dev_err(dev, "Please do not input extra characters!\n"); - return -EINVAL; + writel(qm->type_rate, qm->io_base + QM_SHAPER_CFG); + for (i = ALG_TYPE_0; i <= ALG_TYPE_1; i++) { + /* The base number of queue reuse for different alg type */ + ret = qm_set_vft_common(qm, SHAPER_VFT, fun_num, i, 1); + if (ret) + return ret; } return 0; } -static int qm_sq_dump(struct hisi_qm *qm, char *s) +/* The config should be conducted after qm_dev_mem_reset() */ +static int qm_set_sqc_cqc_vft(struct hisi_qm *qm, u32 fun_num, u32 base, + u32 number) { - u16 sq_depth = qm->qp_array->cq_depth; - void *sqe, *sqe_curr; - struct hisi_qp *qp; - u32 qp_id, sqe_id; - int ret; - - ret = q_dump_param_parse(qm, s, &sqe_id, &qp_id, sq_depth); - if (ret) - return ret; - - sqe = kzalloc(qm->sqe_size * sq_depth, GFP_KERNEL); - if (!sqe) - return -ENOMEM; - - qp = &qm->qp_array[qp_id]; - memcpy(sqe, qp->sqe, qm->sqe_size * sq_depth); - sqe_curr = sqe + (u32)(sqe_id * qm->sqe_size); - memset(sqe_curr + qm->debug.sqe_mask_offset, QM_SQE_ADDR_MASK, - qm->debug.sqe_mask_len); + int ret, i; - dump_show(qm, sqe_curr, qm->sqe_size, "SQE"); + for (i = SQC_VFT; i <= CQC_VFT; i++) { + ret = qm_set_vft_common(qm, i, fun_num, base, number); + if (ret) + return ret; + } - kfree(sqe); + /* init default shaper qos val */ + if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) { + ret = qm_shaper_init_vft(qm, fun_num); + if (ret) + goto back_sqc_cqc; + } return 0; +back_sqc_cqc: + for (i = SQC_VFT; i <= CQC_VFT; i++) + qm_set_vft_common(qm, i, fun_num, 0, 0); + + return ret; } -static int qm_cq_dump(struct hisi_qm *qm, char *s) +static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number) { - struct qm_cqe *cqe_curr; - struct hisi_qp *qp; - u32 qp_id, cqe_id; + u64 sqc_vft; int ret; - ret = q_dump_param_parse(qm, s, &cqe_id, &qp_id, qm->qp_array->cq_depth); + ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); if (ret) return ret; - qp = &qm->qp_array[qp_id]; - cqe_curr = qp->cqe + cqe_id; - dump_show(qm, cqe_curr, sizeof(struct qm_cqe), "CQE"); + sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << 32); + *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); + *number = (QM_SQC_VFT_NUM_MASK_v2 & + (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; return 0; } -static int qm_eq_aeq_dump(struct hisi_qm *qm, const char *s, - size_t size, char *name) +void *hisi_qm_ctx_alloc(struct hisi_qm *qm, size_t ctx_size, + dma_addr_t *dma_addr) { struct device *dev = &qm->pdev->dev; - void *xeqe; - u32 xeqe_id; - int ret; - - if (!s) - return -EINVAL; - - ret = kstrtou32(s, 0, &xeqe_id); - if (ret) - return -EINVAL; - - if (!strcmp(name, "EQE") && xeqe_id >= qm->eq_depth) { - dev_err(dev, "Please input eqe num (0-%u)", qm->eq_depth - 1); - return -EINVAL; - } else if (!strcmp(name, "AEQE") && xeqe_id >= qm->aeq_depth) { - dev_err(dev, "Please input aeqe num (0-%u)", qm->eq_depth - 1); - return -EINVAL; - } - - down_read(&qm->qps_lock); - - if (qm->eqe && !strcmp(name, "EQE")) { - xeqe = qm->eqe + xeqe_id; - } else if (qm->aeqe && !strcmp(name, "AEQE")) { - xeqe = qm->aeqe + xeqe_id; - } else { - ret = -EINVAL; - goto err_unlock; - } - - dump_show(qm, xeqe, size, name); - -err_unlock: - up_read(&qm->qps_lock); - return ret; -} + void *ctx_addr; -static int qm_dbg_help(struct hisi_qm *qm, char *s) -{ - struct device *dev = &qm->pdev->dev; + ctx_addr = kzalloc(ctx_size, GFP_KERNEL); + if (!ctx_addr) + return ERR_PTR(-ENOMEM); - if (strsep(&s, " ")) { - dev_err(dev, "Please do not input extra characters!\n"); - return -EINVAL; + *dma_addr = dma_map_single(dev, ctx_addr, ctx_size, DMA_FROM_DEVICE); + if (dma_mapping_error(dev, *dma_addr)) { + dev_err(dev, "DMA mapping error!\n"); + kfree(ctx_addr); + return ERR_PTR(-ENOMEM); } - dev_info(dev, "available commands:\n"); - dev_info(dev, "sqc \n"); - dev_info(dev, "cqc \n"); - dev_info(dev, "eqc\n"); - dev_info(dev, "aeqc\n"); - dev_info(dev, "sq \n"); - dev_info(dev, "cq \n"); - dev_info(dev, "eq \n"); - dev_info(dev, "aeq \n"); - - return 0; + return ctx_addr; } -static int qm_cmd_write_dump(struct hisi_qm *qm, const char *cmd_buf) +void hisi_qm_ctx_free(struct hisi_qm *qm, size_t ctx_size, + const void *ctx_addr, dma_addr_t *dma_addr) { struct device *dev = &qm->pdev->dev; - char *presult, *s, *s_tmp; - int ret; - - s = kstrdup(cmd_buf, GFP_KERNEL); - if (!s) - return -ENOMEM; - - s_tmp = s; - presult = strsep(&s, " "); - if (!presult) { - ret = -EINVAL; - goto err_buffer_free; - } - - if (!strcmp(presult, "sqc")) - ret = qm_sqc_dump(qm, s); - else if (!strcmp(presult, "cqc")) - ret = qm_cqc_dump(qm, s); - else if (!strcmp(presult, "eqc")) - ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_eqc), - QM_MB_CMD_EQC, "EQC"); - else if (!strcmp(presult, "aeqc")) - ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_aeqc), - QM_MB_CMD_AEQC, "AEQC"); - else if (!strcmp(presult, "sq")) - ret = qm_sq_dump(qm, s); - else if (!strcmp(presult, "cq")) - ret = qm_cq_dump(qm, s); - else if (!strcmp(presult, "eq")) - ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_eqe), "EQE"); - else if (!strcmp(presult, "aeq")) - ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_aeqe), "AEQE"); - else if (!strcmp(presult, "help")) - ret = qm_dbg_help(qm, s); - else - ret = -EINVAL; - - if (ret) - dev_info(dev, "Please echo help\n"); - -err_buffer_free: - kfree(s_tmp); - return ret; + dma_unmap_single(dev, *dma_addr, ctx_size, DMA_FROM_DEVICE); + kfree(ctx_addr); } -static ssize_t qm_cmd_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *pos) +static int qm_dump_sqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) { - struct hisi_qm *qm = filp->private_data; - char *cmd_buf, *cmd_buf_tmp; - int ret; - - if (*pos) - return 0; - - ret = hisi_qm_get_dfx_access(qm); - if (ret) - return ret; - - /* Judge if the instance is being reset. */ - if (unlikely(atomic_read(&qm->status.flags) == QM_STOP)) { - ret = 0; - goto put_dfx_access; - } - - if (count > QM_DBG_WRITE_LEN) { - ret = -ENOSPC; - goto put_dfx_access; - } - - cmd_buf = memdup_user_nul(buffer, count); - if (IS_ERR(cmd_buf)) { - ret = PTR_ERR(cmd_buf); - goto put_dfx_access; - } - - cmd_buf_tmp = strchr(cmd_buf, '\n'); - if (cmd_buf_tmp) { - *cmd_buf_tmp = '\0'; - count = cmd_buf_tmp - cmd_buf + 1; - } - - ret = qm_cmd_write_dump(qm, cmd_buf); - if (ret) { - kfree(cmd_buf); - goto put_dfx_access; - } - - kfree(cmd_buf); - - ret = count; - -put_dfx_access: - hisi_qm_put_dfx_access(qm); - return ret; + return hisi_qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1); } -static const struct file_operations qm_cmd_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = qm_cmd_read, - .write = qm_cmd_write, -}; - -static void qm_create_debugfs_file(struct hisi_qm *qm, struct dentry *dir, - enum qm_debug_file index) +static int qm_dump_cqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id) { - struct debugfs_file *file = qm->debug.files + index; - - debugfs_create_file(qm_debug_file_name[index], 0600, dir, file, - &qm_debug_fops); - - file->index = index; - mutex_init(&file->lock); - file->debug = &qm->debug; + return hisi_qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1); } static void qm_hw_error_init_v1(struct hisi_qm *qm) @@ -3155,7 +2130,7 @@ static int qm_drain_qp(struct hisi_qp *qp) return ret; } - addr = qm_ctx_alloc(qm, size, &dma_addr); + addr = hisi_qm_ctx_alloc(qm, size, &dma_addr); if (IS_ERR(addr)) { dev_err(dev, "Failed to alloc ctx for sqc and cqc!\n"); return -ENOMEM; @@ -3190,7 +2165,7 @@ static int qm_drain_qp(struct hisi_qp *qp) usleep_range(WAIT_PERIOD_US_MIN, WAIT_PERIOD_US_MAX); } - qm_ctx_free(qm, size, addr, &dma_addr); + hisi_qm_ctx_free(qm, size, addr, &dma_addr); return ret; } @@ -4173,45 +3148,6 @@ err_unlock: } EXPORT_SYMBOL_GPL(hisi_qm_stop); -static ssize_t qm_status_read(struct file *filp, char __user *buffer, - size_t count, loff_t *pos) -{ - struct hisi_qm *qm = filp->private_data; - char buf[QM_DBG_READ_LEN]; - int val, len; - - val = atomic_read(&qm->status.flags); - len = scnprintf(buf, QM_DBG_READ_LEN, "%s\n", qm_s[val]); - - return simple_read_from_buffer(buffer, count, pos, buf, len); -} - -static const struct file_operations qm_status_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = qm_status_read, -}; - -static int qm_debugfs_atomic64_set(void *data, u64 val) -{ - if (val) - return -EINVAL; - - atomic64_set((atomic64_t *)data, 0); - - return 0; -} - -static int qm_debugfs_atomic64_get(void *data, u64 *val) -{ - *val = atomic64_read((atomic64_t *)data); - - return 0; -} - -DEFINE_DEBUGFS_ATTRIBUTE(qm_atomic64_ops, qm_debugfs_atomic64_get, - qm_debugfs_atomic64_set, "%llu\n"); - static void qm_hw_error_init(struct hisi_qm *qm) { if (!qm->ops->hw_error_init) { @@ -4732,7 +3668,7 @@ static const struct file_operations qm_algqos_fops = { * * Create function qos debugfs files, VF ping PF to get function qos. */ -static void hisi_qm_set_algqos_init(struct hisi_qm *qm) +void hisi_qm_set_algqos_init(struct hisi_qm *qm) { if (qm->fun_type == QM_HW_PF) debugfs_create_file("alg_qos", 0644, qm->debug.debug_root, @@ -4742,88 +3678,6 @@ static void hisi_qm_set_algqos_init(struct hisi_qm *qm) qm, &qm_algqos_fops); } -/** - * hisi_qm_debug_init() - Initialize qm related debugfs files. - * @qm: The qm for which we want to add debugfs files. - * - * Create qm related debugfs files. - */ -void hisi_qm_debug_init(struct hisi_qm *qm) -{ - struct dfx_diff_registers *qm_regs = qm->debug.qm_diff_regs; - struct qm_dfx *dfx = &qm->debug.dfx; - struct dentry *qm_d; - void *data; - int i; - - qm_d = debugfs_create_dir("qm", qm->debug.debug_root); - qm->debug.qm_d = qm_d; - - /* only show this in PF */ - if (qm->fun_type == QM_HW_PF) { - qm_create_debugfs_file(qm, qm->debug.debug_root, CURRENT_QM); - for (i = CURRENT_Q; i < DEBUG_FILE_NUM; i++) - qm_create_debugfs_file(qm, qm->debug.qm_d, i); - } - - if (qm_regs) - debugfs_create_file("diff_regs", 0444, qm->debug.qm_d, - qm, &qm_diff_regs_fops); - - debugfs_create_file("regs", 0444, qm->debug.qm_d, qm, &qm_regs_fops); - - debugfs_create_file("cmd", 0600, qm->debug.qm_d, qm, &qm_cmd_fops); - - debugfs_create_file("status", 0444, qm->debug.qm_d, qm, - &qm_status_fops); - for (i = 0; i < ARRAY_SIZE(qm_dfx_files); i++) { - data = (atomic64_t *)((uintptr_t)dfx + qm_dfx_files[i].offset); - debugfs_create_file(qm_dfx_files[i].name, - 0644, - qm_d, - data, - &qm_atomic64_ops); - } - - if (test_bit(QM_SUPPORT_FUNC_QOS, &qm->caps)) - hisi_qm_set_algqos_init(qm); -} -EXPORT_SYMBOL_GPL(hisi_qm_debug_init); - -/** - * hisi_qm_debug_regs_clear() - clear qm debug related registers. - * @qm: The qm for which we want to clear its debug registers. - */ -void hisi_qm_debug_regs_clear(struct hisi_qm *qm) -{ - const struct debugfs_reg32 *regs; - int i; - - /* clear current_qm */ - writel(0x0, qm->io_base + QM_DFX_MB_CNT_VF); - writel(0x0, qm->io_base + QM_DFX_DB_CNT_VF); - - /* clear current_q */ - writel(0x0, qm->io_base + QM_DFX_SQE_CNT_VF_SQN); - writel(0x0, qm->io_base + QM_DFX_CQE_CNT_VF_CQN); - - /* - * these registers are reading and clearing, so clear them after - * reading them. - */ - writel(0x1, qm->io_base + QM_DFX_CNT_CLR_CE); - - regs = qm_dfx_regs; - for (i = 0; i < CNT_CYC_REGS_NUM; i++) { - readl(qm->io_base + regs->offset); - regs++; - } - - /* clear clear_enable */ - writel(0x0, qm->io_base + QM_DFX_CNT_CLR_CE); -} -EXPORT_SYMBOL_GPL(hisi_qm_debug_regs_clear); - static void hisi_qm_init_vf_qos(struct hisi_qm *qm, int total_func) { int i; @@ -5462,24 +4316,6 @@ static int qm_controller_reset_done(struct hisi_qm *qm) return 0; } -static void qm_show_last_dfx_regs(struct hisi_qm *qm) -{ - struct qm_debug *debug = &qm->debug; - struct pci_dev *pdev = qm->pdev; - u32 val; - int i; - - if (qm->fun_type == QM_HW_VF || !debug->qm_last_words) - return; - - for (i = 0; i < ARRAY_SIZE(qm_dfx_regs); i++) { - val = readl_relaxed(qm->io_base + qm_dfx_regs[i].offset); - if (debug->qm_last_words[i] != val) - pci_info(pdev, "%s \t= 0x%08x => 0x%08x\n", - qm_dfx_regs[i].name, debug->qm_last_words[i], val); - } -} - static int qm_controller_reset(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -5495,7 +4331,7 @@ static int qm_controller_reset(struct hisi_qm *qm) return ret; } - qm_show_last_dfx_regs(qm); + hisi_qm_show_last_dfx_regs(qm); if (qm->err_ini->show_last_dfx_regs) qm->err_ini->show_last_dfx_regs(qm); diff --git a/drivers/crypto/hisilicon/qm_common.h b/drivers/crypto/hisilicon/qm_common.h new file mode 100644 index 0000000000000..1406a422d4551 --- /dev/null +++ b/drivers/crypto/hisilicon/qm_common.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022 HiSilicon Limited. */ +#ifndef QM_COMMON_H +#define QM_COMMON_H + +#define QM_DBG_READ_LEN 256 +#define QM_RESETTING 2 + +struct qm_cqe { + __le32 rsvd0; + __le16 cmd_id; + __le16 rsvd1; + __le16 sq_head; + __le16 sq_num; + __le16 rsvd2; + __le16 w7; +}; + +struct qm_eqe { + __le32 dw0; +}; + +struct qm_aeqe { + __le32 dw0; +}; + +struct qm_sqc { + __le16 head; + __le16 tail; + __le32 base_l; + __le32 base_h; + __le32 dw3; + __le16 w8; + __le16 rsvd0; + __le16 pasid; + __le16 w11; + __le16 cq_num; + __le16 w13; + __le32 rsvd1; +}; + +struct qm_cqc { + __le16 head; + __le16 tail; + __le32 base_l; + __le32 base_h; + __le32 dw3; + __le16 w8; + __le16 rsvd0; + __le16 pasid; + __le16 w11; + __le32 dw6; + __le32 rsvd1; +}; + +struct qm_eqc { + __le16 head; + __le16 tail; + __le32 base_l; + __le32 base_h; + __le32 dw3; + __le32 rsvd[2]; + __le32 dw6; +}; + +struct qm_aeqc { + __le16 head; + __le16 tail; + __le32 base_l; + __le32 base_h; + __le32 dw3; + __le32 rsvd[2]; + __le32 dw6; +}; + +static const char * const qm_s[] = { + "init", "start", "close", "stop", +}; + +void *hisi_qm_ctx_alloc(struct hisi_qm *qm, size_t ctx_size, + dma_addr_t *dma_addr); +void hisi_qm_ctx_free(struct hisi_qm *qm, size_t ctx_size, + const void *ctx_addr, dma_addr_t *dma_addr); +void hisi_qm_show_last_dfx_regs(struct hisi_qm *qm); +void hisi_qm_set_algqos_init(struct hisi_qm *qm); + +#endif -- GitLab From 9c75609842f091fa814153d21243b07988dc8ef3 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 02:12:53 +0000 Subject: [PATCH 0759/1423] crypto: hisilicon/qm - the command dump process is modified Reduce the function complexity by use the function table in the process of dumping queue. The function input parameters are unified. And maintainability is enhanced. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/debugfs.c | 130 ++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 40 deletions(-) diff --git a/drivers/crypto/hisilicon/debugfs.c b/drivers/crypto/hisilicon/debugfs.c index 13bec8b2d7237..2cc1591949db7 100644 --- a/drivers/crypto/hisilicon/debugfs.c +++ b/drivers/crypto/hisilicon/debugfs.c @@ -36,6 +36,12 @@ struct qm_dfx_item { u32 offset; }; +struct qm_cmd_dump_item { + const char *cmd; + char *info_name; + int (*dump_fn)(struct hisi_qm *qm, char *cmd, char *info_name); +}; + static struct qm_dfx_item qm_dfx_files[] = { {"err_irq", offsetof(struct qm_dfx, err_irq_cnt)}, {"aeq_irq", offsetof(struct qm_dfx, aeq_irq_cnt)}, @@ -128,7 +134,7 @@ static void dump_show(struct hisi_qm *qm, void *info, } } -static int qm_sqc_dump(struct hisi_qm *qm, const char *s) +static int qm_sqc_dump(struct hisi_qm *qm, char *s, char *name) { struct device *dev = &qm->pdev->dev; struct qm_sqc *sqc, *sqc_curr; @@ -162,14 +168,14 @@ static int qm_sqc_dump(struct hisi_qm *qm, const char *s) goto free_ctx; } - dump_show(qm, sqc, sizeof(*sqc), "SQC"); + dump_show(qm, sqc, sizeof(*sqc), name); free_ctx: hisi_qm_ctx_free(qm, sizeof(*sqc), sqc, &sqc_dma); return 0; } -static int qm_cqc_dump(struct hisi_qm *qm, const char *s) +static int qm_cqc_dump(struct hisi_qm *qm, char *s, char *name) { struct device *dev = &qm->pdev->dev; struct qm_cqc *cqc, *cqc_curr; @@ -203,26 +209,35 @@ static int qm_cqc_dump(struct hisi_qm *qm, const char *s) goto free_ctx; } - dump_show(qm, cqc, sizeof(*cqc), "CQC"); + dump_show(qm, cqc, sizeof(*cqc), name); free_ctx: hisi_qm_ctx_free(qm, sizeof(*cqc), cqc, &cqc_dma); return 0; } -static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, size_t size, - int cmd, char *name) +static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, char *name) { struct device *dev = &qm->pdev->dev; dma_addr_t xeqc_dma; + size_t size; void *xeqc; int ret; + u8 cmd; if (strsep(&s, " ")) { dev_err(dev, "Please do not input extra characters!\n"); return -EINVAL; } + if (!strcmp(name, "EQC")) { + cmd = QM_MB_CMD_EQC; + size = sizeof(struct qm_eqc); + } else { + cmd = QM_MB_CMD_AEQC; + size = sizeof(struct qm_aeqc); + } + xeqc = hisi_qm_ctx_alloc(qm, size, &xeqc_dma); if (IS_ERR(xeqc)) return PTR_ERR(xeqc); @@ -278,7 +293,7 @@ static int q_dump_param_parse(struct hisi_qm *qm, char *s, return 0; } -static int qm_sq_dump(struct hisi_qm *qm, char *s) +static int qm_sq_dump(struct hisi_qm *qm, char *s, char *name) { u16 sq_depth = qm->qp_array->cq_depth; void *sqe, *sqe_curr; @@ -300,14 +315,14 @@ static int qm_sq_dump(struct hisi_qm *qm, char *s) memset(sqe_curr + qm->debug.sqe_mask_offset, QM_SQE_ADDR_MASK, qm->debug.sqe_mask_len); - dump_show(qm, sqe_curr, qm->sqe_size, "SQE"); + dump_show(qm, sqe_curr, qm->sqe_size, name); kfree(sqe); return 0; } -static int qm_cq_dump(struct hisi_qm *qm, char *s) +static int qm_cq_dump(struct hisi_qm *qm, char *s, char *name) { struct qm_cqe *cqe_curr; struct hisi_qp *qp; @@ -320,15 +335,16 @@ static int qm_cq_dump(struct hisi_qm *qm, char *s) qp = &qm->qp_array[qp_id]; cqe_curr = qp->cqe + cqe_id; - dump_show(qm, cqe_curr, sizeof(struct qm_cqe), "CQE"); + dump_show(qm, cqe_curr, sizeof(struct qm_cqe), name); return 0; } -static int qm_eq_aeq_dump(struct hisi_qm *qm, const char *s, - size_t size, char *name) +static int qm_eq_aeq_dump(struct hisi_qm *qm, char *s, char *name) { struct device *dev = &qm->pdev->dev; + u16 xeq_depth; + size_t size; void *xeqe; u32 xeqe_id; int ret; @@ -340,11 +356,16 @@ static int qm_eq_aeq_dump(struct hisi_qm *qm, const char *s, if (ret) return -EINVAL; - if (!strcmp(name, "EQE") && xeqe_id >= qm->eq_depth) { - dev_err(dev, "Please input eqe num (0-%u)", qm->eq_depth - 1); - return -EINVAL; - } else if (!strcmp(name, "AEQE") && xeqe_id >= qm->aeq_depth) { - dev_err(dev, "Please input aeqe num (0-%u)", qm->eq_depth - 1); + if (!strcmp(name, "EQE")) { + xeq_depth = qm->eq_depth; + size = sizeof(struct qm_eqe); + } else { + xeq_depth = qm->aeq_depth; + size = sizeof(struct qm_aeqe); + } + + if (xeqe_id >= xeq_depth) { + dev_err(dev, "Please input eqe or aeqe num (0-%u)", xeq_depth - 1); return -EINVAL; } @@ -388,11 +409,47 @@ static int qm_dbg_help(struct hisi_qm *qm, char *s) return 0; } +static const struct qm_cmd_dump_item qm_cmd_dump_table[] = { + { + .cmd = "sqc", + .info_name = "SQC", + .dump_fn = qm_sqc_dump, + }, { + .cmd = "cqc", + .info_name = "CQC", + .dump_fn = qm_cqc_dump, + }, { + .cmd = "eqc", + .info_name = "EQC", + .dump_fn = qm_eqc_aeqc_dump, + }, { + .cmd = "aeqc", + .info_name = "AEQC", + .dump_fn = qm_eqc_aeqc_dump, + }, { + .cmd = "sq", + .info_name = "SQE", + .dump_fn = qm_sq_dump, + }, { + .cmd = "cq", + .info_name = "CQE", + .dump_fn = qm_cq_dump, + }, { + .cmd = "eq", + .info_name = "EQE", + .dump_fn = qm_eq_aeq_dump, + }, { + .cmd = "aeq", + .info_name = "AEQE", + .dump_fn = qm_eq_aeq_dump, + }, +}; + static int qm_cmd_write_dump(struct hisi_qm *qm, const char *cmd_buf) { struct device *dev = &qm->pdev->dev; char *presult, *s, *s_tmp; - int ret; + int table_size, i, ret; s = kstrdup(cmd_buf, GFP_KERNEL); if (!s) @@ -405,31 +462,24 @@ static int qm_cmd_write_dump(struct hisi_qm *qm, const char *cmd_buf) goto err_buffer_free; } - if (!strcmp(presult, "sqc")) - ret = qm_sqc_dump(qm, s); - else if (!strcmp(presult, "cqc")) - ret = qm_cqc_dump(qm, s); - else if (!strcmp(presult, "eqc")) - ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_eqc), - QM_MB_CMD_EQC, "EQC"); - else if (!strcmp(presult, "aeqc")) - ret = qm_eqc_aeqc_dump(qm, s, sizeof(struct qm_aeqc), - QM_MB_CMD_AEQC, "AEQC"); - else if (!strcmp(presult, "sq")) - ret = qm_sq_dump(qm, s); - else if (!strcmp(presult, "cq")) - ret = qm_cq_dump(qm, s); - else if (!strcmp(presult, "eq")) - ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_eqe), "EQE"); - else if (!strcmp(presult, "aeq")) - ret = qm_eq_aeq_dump(qm, s, sizeof(struct qm_aeqe), "AEQE"); - else if (!strcmp(presult, "help")) + if (!strcmp(presult, "help")) { ret = qm_dbg_help(qm, s); - else - ret = -EINVAL; + goto err_buffer_free; + } - if (ret) + table_size = ARRAY_SIZE(qm_cmd_dump_table); + for (i = 0; i < table_size; i++) { + if (!strcmp(presult, qm_cmd_dump_table[i].cmd)) { + ret = qm_cmd_dump_table[i].dump_fn(qm, s, + qm_cmd_dump_table[i].info_name); + break; + } + } + + if (i == table_size) { dev_info(dev, "Please echo help\n"); + ret = -EINVAL; + } err_buffer_free: kfree(s_tmp); -- GitLab From 2132d4efaa66388f1f79c79a920908a22464686b Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 12 Nov 2022 08:51:04 +0000 Subject: [PATCH 0760/1423] crypto: hisilicon/sec - fix spelling mistake 'ckeck' -> 'check' There are a couple of spelling mistakes in sec2. Fix them. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 84ae8ddd1a131..a2630ddc02944 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -2009,7 +2009,7 @@ static int sec_aead_sha512_ctx_init(struct crypto_aead *tfm) return sec_aead_ctx_init(tfm, "sha512"); } -static int sec_skcipher_cryptlen_ckeck(struct sec_ctx *ctx, +static int sec_skcipher_cryptlen_check(struct sec_ctx *ctx, struct sec_req *sreq) { u32 cryptlen = sreq->c_req.sk_req->cryptlen; @@ -2071,7 +2071,7 @@ static int sec_skcipher_param_check(struct sec_ctx *ctx, struct sec_req *sreq) } return 0; } else if (c_alg == SEC_CALG_AES || c_alg == SEC_CALG_SM4) { - return sec_skcipher_cryptlen_ckeck(ctx, sreq); + return sec_skcipher_cryptlen_check(ctx, sreq); } dev_err(dev, "skcipher algorithm error!\n"); -- GitLab From 75df46b598b5b46b0857ee7d2410deaf215e23d1 Mon Sep 17 00:00:00 2001 From: Wenkai Lin Date: Sat, 12 Nov 2022 08:51:05 +0000 Subject: [PATCH 0761/1423] crypto: hisilicon/sec - remove continuous blank lines Fix that put two or more continuous blank lines inside function. Signed-off-by: Wenkai Lin Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec2/sec_crypto.c | 1 - drivers/crypto/hisilicon/sec2/sec_main.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index a2630ddc02944..f5bfc9755a4a3 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -283,7 +283,6 @@ static int sec_bd_send(struct sec_ctx *ctx, struct sec_req *req) spin_lock_bh(&qp_ctx->req_lock); ret = hisi_qp_send(qp_ctx->qp, &req->sec_sqe); - if (ctx->fake_req_limit <= atomic_read(&qp_ctx->qp->qp_status.used) && !ret) { list_add_tail(&req->backlog_head, &qp_ctx->backlog); diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index 4e24735d95bae..93572c0d4faa3 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -427,7 +427,6 @@ static void sec_set_endian(struct hisi_qm *qm) if (!IS_ENABLED(CONFIG_64BIT)) reg |= BIT(1); - if (!IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) reg |= BIT(0); -- GitLab From 6c7b2202e4d11572ab23a89aeec49005b94bb966 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Nov 2022 12:25:02 -0500 Subject: [PATCH 0762/1423] KVM: x86: avoid memslot check in NX hugepage recovery if it cannot succeed Since gfn_to_memslot() is relatively expensive, it helps to skip it if it the memslot cannot possibly have dirty logging enabled. In order to do this, add to struct kvm a counter of the number of log-page memslots. While the correct value can only be read with slots_lock taken, the NX recovery thread is content with using an approximate value. Therefore, the counter is an atomic_t. Based on https://lore.kernel.org/kvm/20221027200316.2221027-2-dmatlack@google.com/ by David Matlack. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 22 +++++++++++++++++++--- include/linux/kvm_host.h | 5 +++++ virt/kvm/kvm_main.c | 8 ++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cfff74685a259..4736d7849c60f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6878,16 +6878,32 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); - slot = gfn_to_memslot(kvm, sp->gfn); - WARN_ON_ONCE(!slot); - /* * Unaccount and do not attempt to recover any NX Huge Pages * that are being dirty tracked, as they would just be faulted * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. + * + * Since gfn_to_memslot() is relatively expensive, it helps to + * skip it if it the test cannot possibly return true. On the + * other hand, if any memslot has logging enabled, chances are + * good that all of them do, in which case unaccount_nx_huge_page() + * is much cheaper than zapping the page. + * + * If a memslot update is in progress, reading an incorrect value + * of kvm->nr_memslots_dirty_logging is not a problem: if it is + * becoming zero, gfn_to_memslot() will be done unnecessarily; if + * it is becoming nonzero, the page will be zapped unnecessarily. + * Either way, this only affects efficiency in racy situations, + * and not correctness. */ + slot = NULL; + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + slot = gfn_to_memslot(kvm, sp->gfn); + WARN_ON_ONCE(!slot); + } + if (slot && kvm_slot_dirty_track_enabled(slot)) unaccount_nx_huge_page(kvm, sp); else if (is_tdp_mmu_page(sp)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e6e66c5e56f24..6f0f389f5f9cd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -722,6 +722,11 @@ struct kvm { /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct xarray vcpu_array; + /* + * Protected by slots_lock, but can be read outside if an + * incorrect answer is acceptable. + */ + atomic_t nr_memslots_dirty_logging; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 43bbe4fde078f..1782c4555d94f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1641,6 +1641,8 @@ static void kvm_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + int old_flags = old ? old->flags : 0; + int new_flags = new ? new->flags : 0; /* * Update the total number of memslot pages before calling the arch * hook so that architectures can consume the result directly. @@ -1650,6 +1652,12 @@ static void kvm_commit_memory_region(struct kvm *kvm, else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { + int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; + atomic_set(&kvm->nr_memslots_dirty_logging, + atomic_read(&kvm->nr_memslots_dirty_logging) + change); + } + kvm_arch_commit_memory_region(kvm, old, new, change); switch (change) { -- GitLab From 74c8e6bffbe10c4470139496f930c0b0752c85c9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 29 Oct 2022 00:47:34 -0700 Subject: [PATCH 0763/1423] driver core: Add __alloc_size hint to devm allocators Mark the devm_*alloc()-family of allocations with appropriate __alloc_size()/__realloc_size() hints so the compiler can attempt to reason about buffer lengths from allocations. Cc: Greg Kroah-Hartman Cc: Rasmus Villemoes Cc: Thomas Gleixner Cc: Jason Gunthorpe Cc: Nishanth Menon Cc: Michael Kelley Cc: Dan Williams Cc: Won Chung Signed-off-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221029074734.gonna.276-kees@kernel.org --- include/linux/device.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/device.h b/include/linux/device.h index 424b55df02727..5e4cd857e74f8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -197,9 +197,9 @@ void devres_remove_group(struct device *dev, void *id); int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ -void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; +void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2); void *devm_krealloc(struct device *dev, void *ptr, size_t size, - gfp_t gfp) __must_check; + gfp_t gfp) __must_check __realloc_size(3); __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) __malloc; __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp, @@ -226,7 +226,8 @@ static inline void *devm_kcalloc(struct device *dev, void devm_kfree(struct device *dev, const void *p); char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); -void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) + __realloc_size(3); unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); -- GitLab From 96d845a67b7e406cfed7880a724c8ca6121e022e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 2 Nov 2022 08:42:15 -0700 Subject: [PATCH 0764/1423] drm/fsl-dcu: Fix return type of fsl_dcu_drm_connector_mode_valid() With clang's kernel control flow integrity (kCFI, CONFIG_CFI_CLANG), indirect call targets are validated against the expected function pointer prototype to make sure the call target is valid to help mitigate ROP attacks. If they are not identical, there is a failure at run time, which manifests as either a kernel panic or thread getting killed. A proposed warning in clang aims to catch these at compile time, which reveals: drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c:74:16: error: incompatible function pointer types initializing 'enum drm_mode_status (*)(struct drm_connector *, struct drm_display_mode *)' with an expression of type 'int (struct drm_connector *, struct drm_display_mode *)' [-Werror,-Wincompatible-function-pointer-types-strict] .mode_valid = fsl_dcu_drm_connector_mode_valid, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. ->mode_valid() in 'struct drm_connector_helper_funcs' expects a return type of 'enum drm_mode_status', not 'int'. Adjust the return type of fsl_dcu_drm_connector_mode_valid() to match the prototype's to resolve the warning and CFI failure. Link: https://github.com/ClangBuiltLinux/linux/issues/1750 Reported-by: Sami Tolvanen Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221102154215.78059-1-nathan@kernel.org --- drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c b/drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c index 4d4a715b429d1..2c2b92324a2e9 100644 --- a/drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c +++ b/drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_rgb.c @@ -60,8 +60,9 @@ static int fsl_dcu_drm_connector_get_modes(struct drm_connector *connector) return drm_panel_get_modes(fsl_connector->panel, connector); } -static int fsl_dcu_drm_connector_mode_valid(struct drm_connector *connector, - struct drm_display_mode *mode) +static enum drm_mode_status +fsl_dcu_drm_connector_mode_valid(struct drm_connector *connector, + struct drm_display_mode *mode) { if (mode->hdisplay & 0xf) return MODE_ERROR; -- GitLab From 0ad811cc08a937d875cbad0149c1bab17f84ba05 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 2 Nov 2022 08:56:23 -0700 Subject: [PATCH 0765/1423] drm/sti: Fix return type of sti_{dvo,hda,hdmi}_connector_mode_valid() With clang's kernel control flow integrity (kCFI, CONFIG_CFI_CLANG), indirect call targets are validated against the expected function pointer prototype to make sure the call target is valid to help mitigate ROP attacks. If they are not identical, there is a failure at run time, which manifests as either a kernel panic or thread getting killed. A proposed warning in clang aims to catch these at compile time, which reveals: drivers/gpu/drm/sti/sti_hda.c:637:16: error: incompatible function pointer types initializing 'enum drm_mode_status (*)(struct drm_connector *, struct drm_display_mode *)' with an expression of type 'int (struct drm_connector *, struct drm_display_mode *)' [-Werror,-Wincompatible-function-pointer-types-strict] .mode_valid = sti_hda_connector_mode_valid, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/sti/sti_dvo.c:376:16: error: incompatible function pointer types initializing 'enum drm_mode_status (*)(struct drm_connector *, struct drm_display_mode *)' with an expression of type 'int (struct drm_connector *, struct drm_display_mode *)' [-Werror,-Wincompatible-function-pointer-types-strict] .mode_valid = sti_dvo_connector_mode_valid, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/sti/sti_hdmi.c:1035:16: error: incompatible function pointer types initializing 'enum drm_mode_status (*)(struct drm_connector *, struct drm_display_mode *)' with an expression of type 'int (struct drm_connector *, struct drm_display_mode *)' [-Werror,-Wincompatible-function-pointer-types-strict] .mode_valid = sti_hdmi_connector_mode_valid, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ->mode_valid() in 'struct drm_connector_helper_funcs' expects a return type of 'enum drm_mode_status', not 'int'. Adjust the return type of sti_{dvo,hda,hdmi}_connector_mode_valid() to match the prototype's to resolve the warning and CFI failure. Link: https://github.com/ClangBuiltLinux/linux/issues/1750 Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221102155623.3042869-1-nathan@kernel.org --- drivers/gpu/drm/sti/sti_dvo.c | 5 +++-- drivers/gpu/drm/sti/sti_hda.c | 5 +++-- drivers/gpu/drm/sti/sti_hdmi.c | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/sti/sti_dvo.c b/drivers/gpu/drm/sti/sti_dvo.c index b6ee8a82e656c..076d5f30a09cc 100644 --- a/drivers/gpu/drm/sti/sti_dvo.c +++ b/drivers/gpu/drm/sti/sti_dvo.c @@ -346,8 +346,9 @@ static int sti_dvo_connector_get_modes(struct drm_connector *connector) #define CLK_TOLERANCE_HZ 50 -static int sti_dvo_connector_mode_valid(struct drm_connector *connector, - struct drm_display_mode *mode) +static enum drm_mode_status +sti_dvo_connector_mode_valid(struct drm_connector *connector, + struct drm_display_mode *mode) { int target = mode->clock * 1000; int target_min = target - CLK_TOLERANCE_HZ; diff --git a/drivers/gpu/drm/sti/sti_hda.c b/drivers/gpu/drm/sti/sti_hda.c index 03cc401ed5934..a53b5a15c2a9d 100644 --- a/drivers/gpu/drm/sti/sti_hda.c +++ b/drivers/gpu/drm/sti/sti_hda.c @@ -601,8 +601,9 @@ static int sti_hda_connector_get_modes(struct drm_connector *connector) #define CLK_TOLERANCE_HZ 50 -static int sti_hda_connector_mode_valid(struct drm_connector *connector, - struct drm_display_mode *mode) +static enum drm_mode_status +sti_hda_connector_mode_valid(struct drm_connector *connector, + struct drm_display_mode *mode) { int target = mode->clock * 1000; int target_min = target - CLK_TOLERANCE_HZ; diff --git a/drivers/gpu/drm/sti/sti_hdmi.c b/drivers/gpu/drm/sti/sti_hdmi.c index cb82622877d20..09e0cadb63683 100644 --- a/drivers/gpu/drm/sti/sti_hdmi.c +++ b/drivers/gpu/drm/sti/sti_hdmi.c @@ -1004,8 +1004,9 @@ fail: #define CLK_TOLERANCE_HZ 50 -static int sti_hdmi_connector_mode_valid(struct drm_connector *connector, - struct drm_display_mode *mode) +static enum drm_mode_status +sti_hdmi_connector_mode_valid(struct drm_connector *connector, + struct drm_display_mode *mode) { int target = mode->clock * 1000; int target_min = target - CLK_TOLERANCE_HZ; -- GitLab From 089fe572a2e0a89e36a455d299d801770293d08f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:39 +0100 Subject: [PATCH 0766/1423] x86/hyperv: Move VMCB enlightenment definitions to hyperv-tlfs.h Move Hyper-V's VMCB enlightenment definitions to the TLFS header; the definitions come directly from the TLFS[*], not from KVM. No functional change intended. [*] https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/datatypes/hv_svm_enlightened_vmcb_fields [vitaly: rename VMCB_HV_ -> HV_VMCB_ to match the rest of hyperv-tlfs.h, keep svm/hyperv.h] Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 22 +++++++++++++++++++ arch/x86/kvm/svm/hyperv.h | 22 ------------------- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm_onhyperv.c | 2 +- arch/x86/kvm/svm/svm_onhyperv.h | 4 ++-- .../selftests/kvm/x86_64/hyperv_svm_test.c | 6 ++--- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 3089ec352743b..245a806a97170 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -598,6 +598,28 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF +/* + * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose + * SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB. + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS 31 + struct hv_partition_assist_pg { u32 tlb_lock_count; }; diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index 7d6d97968fb98..c59544cdf03b7 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -10,26 +10,4 @@ #include "../hyperv.h" -/* - * Hyper-V uses the software reserved 32 bytes in VMCB - * control area to expose SVM enlightenments to guests. - */ -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW - #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3aa9184d1e4ed..a2f25bffff488 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -195,7 +195,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!svm->nested.force_msr_bitmap_recalc && kvm_hv_hypercall_enabled(&svm->vcpu) && hve->hv_enlightenments_control.msr_bitmap && - (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) goto set_msrpm_base_pa; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 8cdc62c74a964..ed5e793925441 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -32,7 +32,7 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) hve->hv_vm_id = (unsigned long)vcpu->kvm; if (!hve->hv_enlightenments_control.nested_flush_hypercall) { hve->hv_enlightenments_control.nested_flush_hypercall = 1; - vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } return 0; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index e2fc593804657..66e61a73caebc 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -64,7 +64,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( (struct hv_enlightenments *)vmcb->control.reserved_sw; if (hve->hv_enlightenments_control.msr_bitmap) - vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } static inline void svm_hv_update_vp_id(struct vmcb *vmcb, @@ -76,7 +76,7 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, if (hve->hv_vp_id != vp_index) { hve->hv_vp_id = vp_index; - vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } } #else diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index a380ad7bb9b34..5060fcfe17601 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -39,7 +39,7 @@ struct hv_enlightenments { /* * Hyper-V uses the software reserved clean bit in VMCB */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31) +#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) void l2_guest_code(void) { @@ -98,14 +98,14 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) /* Intercept RDMSR 0xc0000101 without telling KVM about it */ set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ - vmcb->control.clean |= VMCB_HV_NESTED_ENLIGHTENMENTS; + vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS; run_guest(vmcb, svm->vmcb_gpa); /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); vmcb->save.rip += 3; /* vmcall */ /* Now tell KVM we've changed MSR-Bitmap */ - vmcb->control.clean &= ~VMCB_HV_NESTED_ENLIGHTENMENTS; + vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS; run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); vmcb->save.rip += 2; /* rdmsr */ -- GitLab From 381fc63ac0754e05d3921e9d399b89dfdfd2b2e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:40 +0100 Subject: [PATCH 0767/1423] KVM: selftests: Move "struct hv_enlightenments" to x86_64/svm.h Move Hyper-V's VMCB "struct hv_enlightenments" to the svm.h header so that the struct can be referenced in "struct vmcb_control_area". Alternatively, a dedicated header for SVM+Hyper-V could be added, a la x86_64/evmcs.h, but it doesn't appear that Hyper-V will end up needing a wholesale replacement for the VMCB. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/x86_64/svm.h | 17 +++++++++++++++++ .../selftests/kvm/x86_64/hyperv_svm_test.c | 18 ------------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index c8343ff84f7f7..89ce2c6b57fe0 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -58,6 +58,23 @@ enum { INTERCEPT_RDPRU, }; +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercept_cr; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 5060fcfe17601..2fd64b419928a 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -23,24 +23,6 @@ #define L2_GUEST_STACK_SIZE 256 -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) - void l2_guest_code(void) { GUEST_SYNC(3); -- GitLab From 68ae7c7bc56a4504ed5efde7c2f8d6024148a35e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:41 +0100 Subject: [PATCH 0768/1423] KVM: SVM: Add a proper field for Hyper-V VMCB enlightenments Add a union to provide hv_enlightenments side-by-side with the sw_reserved bytes that Hyper-V's enlightenments overlay. Casting sw_reserved everywhere is messy, confusing, and unnecessarily unsafe. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 7 ++++++- arch/x86/kvm/svm/nested.c | 9 ++++----- arch/x86/kvm/svm/svm.h | 5 ++++- arch/x86/kvm/svm/svm_onhyperv.c | 2 +- arch/x86/kvm/svm/svm_onhyperv.h | 15 +++++++-------- tools/testing/selftests/kvm/include/x86_64/svm.h | 5 ++++- .../selftests/kvm/x86_64/hyperv_svm_test.c | 3 +-- 7 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 4352b46dd20c9..b37249e7c6607 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -5,6 +5,8 @@ #include #include +#include + /* * 32-bit intercept words in the VMCB Control Area, starting * at Byte offset 000h. @@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. */ - u8 reserved_sw[32]; + union { + struct hv_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a2f25bffff488..622fe00c3acf1 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -180,8 +180,7 @@ void recalc_intercepts(struct vcpu_svm *svm) */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + struct hv_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; int i; /* @@ -370,8 +369,8 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, /* Hyper-V extensions (Enlightened VMCB) */ if (kvm_hv_hypercall_enabled(vcpu)) { to->clean = from->clean; - memcpy(to->reserved_sw, from->reserved_sw, - sizeof(struct hv_enlightenments)); + memcpy(&to->hv_enlightenments, &from->hv_enlightenments, + sizeof(to->hv_enlightenments)); } } @@ -1488,7 +1487,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; - /* 'clean' and 'reserved_sw' are not changed by KVM */ + /* 'clean' and 'hv_enlightenments' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 199a2ecef1cec..a5af367502e42 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -151,7 +151,10 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; - u8 reserved_sw[32]; + union { + struct hv_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; struct svm_nested_state { diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index ed5e793925441..422d00fee24ab 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -26,7 +26,7 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) if (!*p_hv_pa_pg) return -ENOMEM; - hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw; + hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; hve->partition_assist_page = __pa(*p_hv_pa_pg); hve->hv_vm_id = (unsigned long)vcpu->kvm; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 66e61a73caebc..5c664dd7bee21 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -17,8 +17,10 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; + + BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) != + sizeof(vmcb->control.reserved_sw)); if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) @@ -60,18 +62,15 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { struct vmcb *vmcb = to_svm(vcpu)->vmcb; - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); } -static inline void svm_hv_update_vp_id(struct vmcb *vmcb, - struct kvm_vcpu *vcpu) +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) { - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; u32 vp_index = kvm_hv_get_vpindex(vcpu); if (hve->hv_vp_id != vp_index) { diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index 89ce2c6b57fe0..6e1527aa34191 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -123,7 +123,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. */ - u8 reserved_sw[32]; + union { + struct hv_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 2fd64b419928a..8ef6a4c83cb1e 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -46,8 +46,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; - struct hv_enlightenments *hve = - (struct hv_enlightenments *)vmcb->control.reserved_sw; + struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; GUEST_SYNC(1); -- GitLab From 26b516bb39215cf60aa1fb55d0a6fd73058698fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:42 +0100 Subject: [PATCH 0769/1423] x86/hyperv: KVM: Rename "hv_enlightenments" to "hv_vmcb_enlightenments" Now that KVM isn't littered with "struct hv_enlightenments" casts, rename the struct to "hv_vmcb_enlightenments" to highlight the fact that the struct is specifically for SVM's VMCB. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Michael Kelley Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 2 +- arch/x86/include/asm/svm.h | 2 +- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/svm/svm_onhyperv.c | 2 +- arch/x86/kvm/svm/svm_onhyperv.h | 6 +++--- tools/testing/selftests/kvm/include/x86_64/svm.h | 4 ++-- tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 245a806a97170..c5e0e5a06c0dc 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -602,7 +602,7 @@ struct hv_enlightened_vmcs { * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose * SVM enlightenments to guests. */ -struct hv_enlightenments { +struct hv_vmcb_enlightenments { struct __packed hv_enlightenments_control { u32 nested_flush_hypercall:1; u32 msr_bitmap:1; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index b37249e7c6607..cb1ee53ad3b18 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -164,7 +164,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * for use by hypervisor/software. */ union { - struct hv_enlightenments hv_enlightenments; + struct hv_vmcb_enlightenments hv_enlightenments; u8 reserved_sw[32]; }; }; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 622fe00c3acf1..cc8f47e7e294d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -180,7 +180,7 @@ void recalc_intercepts(struct vcpu_svm *svm) */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - struct hv_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; int i; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a5af367502e42..4826e6cc611bf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -152,7 +152,7 @@ struct vmcb_ctrl_area_cached { u64 virt_ext; u32 clean; union { - struct hv_enlightenments hv_enlightenments; + struct hv_vmcb_enlightenments hv_enlightenments; u8 reserved_sw[32]; }; }; diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 422d00fee24ab..52c73a8be72b1 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -16,7 +16,7 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { - struct hv_enlightenments *hve; + struct hv_vmcb_enlightenments *hve; struct hv_partition_assist_pg **p_hv_pa_pg = &to_kvm_hv(vcpu->kvm)->hv_pa_pg; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 5c664dd7bee21..d5cb2c62e3557 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -17,7 +17,7 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { - struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) != sizeof(vmcb->control.reserved_sw)); @@ -62,7 +62,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { struct vmcb *vmcb = to_svm(vcpu)->vmcb; - struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS); @@ -70,7 +70,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) { - struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; u32 vp_index = kvm_hv_get_vpindex(vcpu); if (hve->hv_vp_id != vp_index) { diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index 6e1527aa34191..483e6ae12f69e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -58,7 +58,7 @@ enum { INTERCEPT_RDPRU, }; -struct hv_enlightenments { +struct hv_vmcb_enlightenments { struct __packed hv_enlightenments_control { u32 nested_flush_hypercall:1; u32 msr_bitmap:1; @@ -124,7 +124,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { * for use by hypervisor/software. */ union { - struct hv_enlightenments hv_enlightenments; + struct hv_vmcb_enlightenments hv_enlightenments; u8 reserved_sw[32]; }; }; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 8ef6a4c83cb1e..1c3fc38b4f151 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -46,7 +46,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; - struct hv_enlightenments *hve = &vmcb->control.hv_enlightenments; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; GUEST_SYNC(1); -- GitLab From b83237ad2167a0f9a43b909adb42623941b741b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:43 +0100 Subject: [PATCH 0770/1423] KVM: x86: Rename 'enable_direct_tlbflush' to 'enable_l2_tlb_flush' To make terminology between Hyper-V-on-KVM and KVM-on-Hyper-V consistent, rename 'enable_direct_tlbflush' to 'enable_l2_tlb_flush'. The change eliminates the use of confusing 'direct' and adds the missing underscore. No functional change. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm_onhyperv.c | 2 +- arch/x86/kvm/svm/svm_onhyperv.h | 6 +++--- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index ea58e67e9a670..abccd51dcfca1 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 598eb3b9ae440..a413f841e8308 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1652,7 +1652,7 @@ struct kvm_x86_ops { void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); - int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 52c73a8be72b1..26a89d0da93e4 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -14,7 +14,7 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; struct hv_partition_assist_pg **p_hv_pa_pg = diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index d5cb2c62e3557..45faf84476cec 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -13,7 +13,7 @@ static struct kvm_x86_ops svm_x86_ops; -int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); +int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { @@ -53,8 +53,8 @@ static inline void svm_hv_hardware_setup(void) vp_ap->nested_control.features.directhypercall = 1; } - svm_x86_ops.enable_direct_tlbflush = - svm_hv_enable_direct_tlbflush; + svm_x86_ops.enable_l2_tlb_flush = + svm_hv_enable_l2_tlb_flush; } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aca88524fd1ec..5806ab88851e4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -527,7 +527,7 @@ static unsigned long host_idt_base; static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; struct hv_partition_assist_pg **p_hv_pa_pg = @@ -8520,8 +8520,8 @@ static int __init vmx_init(void) } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_direct_tlbflush - = hv_enable_direct_tlbflush; + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; } else { enlightened_vmcs = false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 404325a13dc2e..838ed4ea7e4d8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4481,7 +4481,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops.enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_l2_tlb_flush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; @@ -5494,10 +5494,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops.enable_direct_tlbflush) + if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; - return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); -- GitLab From a789aeba419647c44d7e7320de20fea037c211d0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:44 +0100 Subject: [PATCH 0771/1423] KVM: VMX: Rename "vmx/evmcs.{ch}" to "vmx/hyperv.{ch}" To conform with SVM, rename VMX specific Hyper-V files from "evmcs.{ch}" to "hyperv.{ch}". While Enlightened VMCS is a lion's share of these files, some stuff (e.g. enlightened MSR bitmap, the upcoming Hyper-V L2 TLB flush, ...) goes beyond that. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/{evmcs.c => hyperv.c} | 3 +-- arch/x86/kvm/vmx/{evmcs.h => hyperv.h} | 8 +++++--- arch/x86/kvm/vmx/nested.c | 1 - arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/vmx/vmx_ops.h | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) rename arch/x86/kvm/vmx/{evmcs.c => hyperv.c} (99%) rename arch/x86/kvm/vmx/{evmcs.h => hyperv.h} (98%) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b8a494b6a5ec9..4cf407563feea 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -23,7 +23,7 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/evmcs.o vmx/nested.o vmx/posted_intr.o + vmx/hyperv.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c similarity index 99% rename from arch/x86/kvm/vmx/evmcs.c rename to arch/x86/kvm/vmx/hyperv.c index d8b23c96d6272..5e239158174ea 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -3,9 +3,8 @@ #include #include -#include "../hyperv.h" #include "../cpuid.h" -#include "evmcs.h" +#include "hyperv.h" #include "vmcs.h" #include "vmx.h" #include "trace.h" diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h similarity index 98% rename from arch/x86/kvm/vmx/evmcs.h rename to arch/x86/kvm/vmx/hyperv.h index 6f746ef3c0386..99a151af7a817 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KVM_X86_VMX_EVMCS_H -#define __KVM_X86_VMX_EVMCS_H +#ifndef __KVM_X86_VMX_HYPERV_H +#define __KVM_X86_VMX_HYPERV_H #include @@ -8,6 +8,8 @@ #include #include +#include "../hyperv.h" + #include "capabilities.h" #include "vmcs.h" #include "vmcs12.h" @@ -242,4 +244,4 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); -#endif /* __KVM_X86_VMX_EVMCS_H */ +#endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7924dea936781..048b2c3e3b3fc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,7 +7,6 @@ #include #include "cpuid.h" -#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5806ab88851e4..cea8c07f52298 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -51,7 +51,6 @@ #include "capabilities.h" #include "cpuid.h" -#include "evmcs.h" #include "hyperv.h" #include "kvm_onhyperv.h" #include "irq.h" diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index ec268df83ed67..f6f23c7397dc5 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -6,7 +6,7 @@ #include -#include "evmcs.h" +#include "hyperv.h" #include "vmcs.h" #include "../x86.h" -- GitLab From e94cea0930195adee67c93140ad9f95e430b2be8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:45 +0100 Subject: [PATCH 0772/1423] KVM: x86: Move clearing of TLB_FLUSH_CURRENT to kvm_vcpu_flush_tlb_all() Clear KVM_REQ_TLB_FLUSH_CURRENT in kvm_vcpu_flush_tlb_all() instead of in its sole caller that processes KVM_REQ_TLB_FLUSH. Regardless of why/when kvm_vcpu_flush_tlb_all() is called, flushing "all" TLB entries also flushes "current" TLB entries. Ideally, there will never be another caller of kvm_vcpu_flush_tlb_all(), and moving the handling "requires" extra work to document the ordering requirement, but future Hyper-V paravirt TLB flushing support will add similar logic for flush "guest" (Hyper-V can flush a subset of "guest" entries). And in the Hyper-V case, KVM needs to do more than just clear the request, the queue of GPAs to flush also needs to purged, and doing all only in the request path is undesirable as kvm_vcpu_flush_tlb_guest() does have multiple callers (though it's unlikely KVM's paravirt TLB flush will coincide with Hyper-V's paravirt TLB flush). Move the logic even though it adds extra "work" so that KVM will be consistent with how flush requests are processed when the Hyper-V support lands. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 838ed4ea7e4d8..7fc5508c0b4a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3399,6 +3399,9 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_flush_tlb_all)(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -10236,12 +10239,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + + /* + * Note, the order matters here, as flushing "all" TLB entries + * also flushes the "current" TLB entries, i.e. servicing the + * flush "all" will clear any request to flush "current". + */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_all(vcpu); - /* Flushing all ASIDs flushes the current ASID... */ - kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } kvm_service_local_tlb_flush_requests(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- GitLab From adc43caa0a25746e1a9dabbab241abd01120dbfe Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:46 +0100 Subject: [PATCH 0773/1423] KVM: x86: hyper-v: Resurrect dedicated KVM_REQ_HV_TLB_FLUSH flag In preparation to implementing fine-grained Hyper-V TLB flush and L2 TLB flush, resurrect dedicated KVM_REQ_HV_TLB_FLUSH request bit. As KVM_REQ_TLB_FLUSH_GUEST is a stronger operation, clear KVM_REQ_HV_TLB_FLUSH request in kvm_vcpu_flush_tlb_guest(). The flush itself is temporary handled by kvm_vcpu_flush_tlb_guest(). No functional change intended. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-9-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/hyperv.c | 4 ++-- arch/x86/kvm/svm/svm.c | 7 +++++++ arch/x86/kvm/x86.c | 9 +++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a413f841e8308..0b85230a0e0a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -110,6 +110,8 @@ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0adf4a437e85f..3c0f639f6a05e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1870,11 +1870,11 @@ do_flush: * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus) { - kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7efc4fdaa446b..4ea6ddd998994 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3722,6 +3722,13 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. + * A TLB flush for the current ASID flushes both "host" and "guest" TLB + * entries, and thus is a superset of Hyper-V's fine grained flushing. + */ + kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* * Flush only the current ASID even if the TLB flush was invoked via * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fc5508c0b4a9..12e49e8566d49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3420,6 +3420,12 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) } static_call(kvm_x86_flush_tlb_guest)(vcpu); + + /* + * Flushing all "guest" TLB is always a superset of Hyper-V's fine + * grained flushing. + */ + kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); } @@ -10250,6 +10256,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_service_local_tlb_flush_requests(vcpu); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; -- GitLab From 0823570f01989d3703751f66534a138d4fae062e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:47 +0100 Subject: [PATCH 0774/1423] KVM: x86: hyper-v: Introduce TLB flush fifo To allow flushing individual GVAs instead of always flushing the whole VPID a per-vCPU structure to pass the requests is needed. Use standard 'kfifo' to queue two types of entries: individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits) and 'flush all'. The size of the fifo is arbitrarily set to '16'. Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now and kvm_hv_vcpu_flush_tlb() doesn't actually read the fifo just resets the queue before returning -EOPNOTSUPP (which triggers full TLB flush) so the functional change is very small but the infrastructure is prepared to handle individual GVA flush requests. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-10-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 20 ++++++++++++++ arch/x86/kvm/hyperv.c | 47 +++++++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 15 +++++++++++ arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/x86.c | 11 ++++++-- 5 files changed, 92 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b85230a0e0a7..3e35dcf40dc7c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -618,6 +619,23 @@ struct kvm_vcpu_hv_synic { bool dont_zero_synic_pages; }; +/* The maximum number of entries on the TLB flush fifo. */ +#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) +/* + * Note: the following 'magic' entry is made up by KVM to avoid putting + * anything besides GVA on the TLB flush fifo. It is theoretically possible + * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 + * which will look identical. KVM's action to 'flush everything' instead of + * flushing these particular addresses is, however, fully legitimate as + * flushing more than requested is always OK. + */ +#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) + +struct kvm_vcpu_hv_tlb_flush_fifo { + spinlock_t write_lock; + DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -639,6 +657,8 @@ struct kvm_vcpu_hv { u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; + + struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3c0f639f6a05e..9d9a5ff2d54bd 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -954,6 +955,9 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) hv_vcpu->vp_index = vcpu->vcpu_idx; + INIT_KFIFO(hv_vcpu->tlb_flush_fifo.entries); + spin_lock_init(&hv_vcpu->tlb_flush_fifo.write_lock); + return 0; } @@ -1783,6 +1787,37 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, var_cnt * sizeof(*sparse_banks)); } +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; + + if (!hv_vcpu) + return; + + tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + + kfifo_in_spinlocked_noirqsave(&tlb_flush_fifo->entries, &flush_all_entry, + 1, &tlb_flush_fifo->write_lock); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return -EINVAL; + + tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + + kfifo_reset_out(&tlb_flush_fifo->entries); + + /* Precise flushing isn't implemented yet. */ + return -EOPNOTSUPP; +} + static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm *kvm = vcpu->kvm; @@ -1791,6 +1826,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + struct kvm_vcpu *v; + unsigned long i; bool all_cpus; /* @@ -1870,10 +1907,20 @@ do_flush: * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus) { + kvm_for_each_vcpu(i, v, kvm) + hv_tlb_flush_enqueue(v); + kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); + for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { + v = kvm_get_vcpu(kvm, i); + if (!v) + continue; + hv_tlb_flush_enqueue(v); + } + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 1030b1b505523..f79edf9234cd4 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -151,4 +151,19 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + return; + + tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + + kfifo_reset_out(&tlb_flush_fifo->entries); +} + +int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4ea6ddd998994..91352d6928452 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3727,7 +3727,7 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) * A TLB flush for the current ASID flushes both "host" and "guest" TLB * entries, and thus is a superset of Hyper-V's fine grained flushing. */ - kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + kvm_hv_vcpu_purge_flush_tlb(vcpu); /* * Flush only the current ASID even if the TLB flush was invoked via diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12e49e8566d49..72ac6bf05c8b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3425,7 +3425,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) * Flushing all "guest" TLB is always a superset of Hyper-V's fine * grained flushing. */ - kvm_clear_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + kvm_hv_vcpu_purge_flush_tlb(vcpu); } @@ -10256,7 +10256,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_service_local_tlb_flush_requests(vcpu); - if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + /* + * Fall back to a "full" guest flush if Hyper-V's precise + * flushing fails. Note, Hyper-V's flushing is per-vCPU, but + * the flushes are considered "remote" and not "local" because + * the requests can be initiated from other vCPUs. + */ + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && + kvm_hv_vcpu_flush_tlb(vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- GitLab From 56b5354fd8f9173de2e1614864e5fb7bec8c50c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Nov 2022 15:53:48 +0100 Subject: [PATCH 0775/1423] KVM: x86: hyper-v: Add helper to read hypercall data for array Move the guts of kvm_get_sparse_vp_set() to a helper so that the code for reading a guest-provided array can be reused in the future, e.g. for getting a list of virtual addresses whose TLB entries need to be flushed. Opportunisticaly swap the order of the data and XMM adjustment so that the XMM/gpa offsets are bundled together. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-11-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 53 +++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 9d9a5ff2d54bd..3ba7e2d2fbbdc 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1753,38 +1753,51 @@ struct kvm_hv_hcall { sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; -static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, - int consumed_xmm_halves, - u64 *sparse_banks, gpa_t offset) -{ - u16 var_cnt; - int i; - if (hc->var_cnt > 64) - return -EINVAL; - - /* Ignore banks that cannot possibly contain a legal VP index. */ - var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS); +static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, + u16 orig_cnt, u16 cnt_cap, u64 *data, + int consumed_xmm_halves, gpa_t offset) +{ + /* + * Preserve the original count when ignoring entries via a "cap", KVM + * still needs to validate the guest input (though the non-XMM path + * punts on the checks). + */ + u16 cnt = min(orig_cnt, cnt_cap); + int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ - if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) + if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; - for (i = 0; i < var_cnt; i++) { - int j = i + consumed_xmm_halves; + + for (i = 0; i < cnt; i++) { + j = i + consumed_xmm_halves; if (j % 2) - sparse_banks[i] = sse128_hi(hc->xmm[j / 2]); + data[i] = sse128_hi(hc->xmm[j / 2]); else - sparse_banks[i] = sse128_lo(hc->xmm[j / 2]); + data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } - return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks, - var_cnt * sizeof(*sparse_banks)); + return kvm_read_guest(kvm, hc->ingpa + offset, data, + cnt * sizeof(*data)); +} + +static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, + u64 *sparse_banks, int consumed_xmm_halves, + gpa_t offset) +{ + if (hc->var_cnt > 64) + return -EINVAL; + + /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ + return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, + sparse_banks, consumed_xmm_halves, offset); } static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu) @@ -1895,7 +1908,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks, + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 2, offsetof(struct hv_tlb_flush_ex, hv_vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -2006,7 +2019,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks, + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1, offsetof(struct hv_send_ipi_ex, vp_set.bank_contents))) return HV_STATUS_INVALID_HYPERCALL_INPUT; -- GitLab From 260970862c88b4130e9e12be023c7e2c2d37a966 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:49 +0100 Subject: [PATCH 0776/1423] KVM: x86: hyper-v: Handle HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls gently Currently, HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} calls are handled the exact same way as HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE{,EX}: by flushing the whole VPID and this is sub-optimal. Switch to handling these requests with 'flush_tlb_gva()' hooks instead. Use the newly introduced TLB flush fifo to queue the requests. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-12-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 111 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3ba7e2d2fbbdc..6868c478617c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1800,7 +1800,14 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, sparse_banks, consumed_xmm_halves, offset); } -static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu) +static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[], + int consumed_xmm_halves, gpa_t offset) +{ + return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, + entries, consumed_xmm_halves, offset); +} + +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); @@ -1811,24 +1818,64 @@ static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu) tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; - kfifo_in_spinlocked_noirqsave(&tlb_flush_fifo->entries, &flush_all_entry, - 1, &tlb_flush_fifo->write_lock); + spin_lock(&tlb_flush_fifo->write_lock); + + /* + * All entries should fit on the fifo leaving one free for 'flush all' + * entry in case another request comes in. In case there's not enough + * space, just put 'flush all' entry there. + */ + if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { + WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); + goto out_unlock; + } + + /* + * Note: full fifo always contains 'flush all' entry, no need to check the + * return value. + */ + kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); + +out_unlock: + spin_unlock(&tlb_flush_fifo->write_lock); } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; + int i, j, count; + gva_t gva; - if (!hv_vcpu) + if (!tdp_enabled || !hv_vcpu) return -EINVAL; tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); + + for (i = 0; i < count; i++) { + if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) + goto out_flush_all; + + /* + * Lower 12 bits of 'address' encode the number of additional + * pages to flush. + */ + gva = entries[i] & PAGE_MASK; + for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) + static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + + ++vcpu->stat.tlb_flush; + } + return 0; + +out_flush_all: kfifo_reset_out(&tlb_flush_fifo->entries); - /* Precise flushing isn't implemented yet. */ - return -EOPNOTSUPP; + /* Fall back to full flush. */ + return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) @@ -1837,11 +1884,21 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + /* + * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' + * entries on the TLB flush fifo. The last entry, however, needs to be + * always left free for 'flush all' entry which gets placed when + * there is not enough space to put all the requested entries. + */ + u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; + u64 *tlb_flush_entries; u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *v; unsigned long i; bool all_cpus; + int consumed_xmm_halves = 0; + gpa_t data_offset; /* * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the @@ -1857,10 +1914,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); + consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, @@ -1884,10 +1943,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); + consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; + data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, @@ -1902,26 +1963,44 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) - goto do_flush; + if (!all_cpus) { + if (!hc->var_cnt) + goto ret_success; - if (!hc->var_cnt) - goto ret_success; + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, + consumed_xmm_halves, data_offset)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + /* + * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU + * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' + * case (HV_GENERIC_SET_ALL). Always adjust data_offset and + * consumed_xmm_halves to make sure TLB flush entries are read + * from the correct offset. + */ + data_offset += hc->var_cnt * sizeof(sparse_banks[0]); + consumed_xmm_halves += hc->var_cnt; + } - if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 2, - offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents))) + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { + tlb_flush_entries = NULL; + } else { + if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries, + consumed_xmm_halves, data_offset)) return HV_STATUS_INVALID_HYPERCALL_INPUT; + tlb_flush_entries = __tlb_flush_entries; } -do_flush: /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus) { kvm_for_each_vcpu(i, v, kvm) - hv_tlb_flush_enqueue(v); + hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt); kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else { @@ -1931,7 +2010,7 @@ do_flush: v = kvm_get_vcpu(kvm, i); if (!v) continue; - hv_tlb_flush_enqueue(v); + hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); -- GitLab From f84fcb66568c0b00626f7f03e28db7d0dcba8098 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:50 +0100 Subject: [PATCH 0777/1423] KVM: x86: hyper-v: Expose support for extended gva ranges for flush hypercalls Extended GVA ranges support bit seems to indicate whether lower 12 bits of GVA can be used to specify up to 4095 additional consequent GVAs to flush. This is somewhat described in TLFS. Previously, KVM was handling HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST{,EX} requests by flushing the whole VPID so technically, extended GVA ranges were already supported. As such requests are handled more gently now, advertizing support for extended ranges starts making sense to reduce the size of TLB flush requests. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-13-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 2 ++ arch/x86/kvm/hyperv.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index c5e0e5a06c0dc..6639979302ab7 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -61,6 +61,8 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) /* Support for debug MSRs available */ #define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +/* Support for extended gva ranges for flush hypercalls available */ +#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14) /* * Support for returning hypercall output block via XMM * registers is available diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6868c478617c0..fca9c51891f5f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2641,6 +2641,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel -- GitLab From aee738236dca0d0870789138ec494e15d6303566 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:51 +0100 Subject: [PATCH 0778/1423] KVM: x86: Prepare kvm_hv_flush_tlb() to handle L2's GPAs To handle L2 TLB flush requests, KVM needs to translate the specified L2 GPA to L1 GPA to read hypercall arguments from there. No functional change as KVM doesn't handle VMCALL/VMMCALL from L2 yet. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-14-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fca9c51891f5f..cb145987f5b85 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" +#include "mmu.h" #include "xen.h" #include @@ -1908,6 +1909,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) */ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + /* + * 'Slow' hypercall's first parameter is the address in guest's memory + * where hypercall parameters are placed. This is either a GPA or a + * nested GPA when KVM is handling the call from L2 ('direct' TLB + * flush). Translate the address here so the memory can be uniformly + * read with kvm_read_guest(). + */ + if (!hc->fast && is_guest_mode(vcpu)) { + hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); + if (unlikely(hc->ingpa == INVALID_GPA)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { if (hc->fast) { -- GitLab From bd19c94a19b09b563a20862c651859f6e3d73847 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:52 +0100 Subject: [PATCH 0779/1423] x86/hyperv: Introduce HV_MAX_SPARSE_VCPU_BANKS/HV_VCPUS_PER_SPARSE_BANK constants It may not come clear from where the magical '64' value used in __cpumask_to_vpset() come from. Moreover, '64' means both the maximum sparse bank number as well as the number of vCPUs per bank. Add defines to make things clear. These defines are also going to be used by KVM. No functional change. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-15-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- include/asm-generic/hyperv-tlfs.h | 5 +++++ include/asm-generic/mshyperv.h | 11 ++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdce7a4cfc6ff..020ca9bdbb79a 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -399,6 +399,11 @@ struct hv_vpset { u64 bank_contents[]; } __packed; +/* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */ +#define HV_MAX_SPARSE_VCPU_BANKS (64) +/* The number of vCPUs in one sparse bank */ +#define HV_VCPUS_PER_SPARSE_BANK (64) + /* HvCallSendSyntheticClusterIpi hypercall */ struct hv_send_ipi { u32 vector; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index bfb9eb9d7215b..d55d2833a37b7 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -211,9 +211,10 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset, { int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; int this_cpu = smp_processor_id(); + int max_vcpu_bank = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK; - /* valid_bank_mask can represent up to 64 banks */ - if (hv_max_vp_index / 64 >= 64) + /* vpset.valid_bank_mask can represent up to HV_MAX_SPARSE_VCPU_BANKS banks */ + if (max_vcpu_bank >= HV_MAX_SPARSE_VCPU_BANKS) return 0; /* @@ -221,7 +222,7 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset, * structs are not cleared between calls, we risk flushing unneeded * vCPUs otherwise. */ - for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + for (vcpu_bank = 0; vcpu_bank <= max_vcpu_bank; vcpu_bank++) vpset->bank_contents[vcpu_bank] = 0; /* @@ -233,8 +234,8 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset, vcpu = hv_cpu_number_to_vp_number(cpu); if (vcpu == VP_INVAL) return -1; - vcpu_bank = vcpu / 64; - vcpu_offset = vcpu % 64; + vcpu_bank = vcpu / HV_VCPUS_PER_SPARSE_BANK; + vcpu_offset = vcpu % HV_VCPUS_PER_SPARSE_BANK; __set_bit(vcpu_offset, (unsigned long *) &vpset->bank_contents[vcpu_bank]); if (vcpu_bank >= nr_bank) -- GitLab From ca7372aca7f4b2f1b29a9941053999d224d1e7c7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:53 +0100 Subject: [PATCH 0780/1423] KVM: x86: hyper-v: Use HV_MAX_SPARSE_VCPU_BANKS/HV_VCPUS_PER_SPARSE_BANK instead of raw '64' It may not be clear from where the '64' limit for the maximum sparse bank number comes from, use HV_MAX_SPARSE_VCPU_BANKS define instead. Use HV_VCPUS_PER_SPARSE_BANK in KVM_HV_MAX_SPARSE_VCPU_SET_BITS's definition. Opportunistically adjust the comment around BUILD_BUG_ON(). No functional change. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-16-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cb145987f5b85..2fceb85687c13 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -40,7 +40,7 @@ #include "irq.h" #include "fpu.h" -#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -1793,7 +1793,7 @@ static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 *sparse_banks, int consumed_xmm_halves, gpa_t offset) { - if (hc->var_cnt > 64) + if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) return -EINVAL; /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ @@ -1902,12 +1902,11 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) gpa_t data_offset; /* - * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the - * valid mask is a u64. Fail the build if KVM's max allowed number of - * vCPUs (>4096) would exceed this limit, KVM will additional changes - * for Hyper-V support to avoid setting the guest up to fail. + * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS + * sparse banks. Fail the build if KVM's max allowed number of + * vCPUs (>4096) exceeds this limit. */ - BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64); + BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); /* * 'Slow' hypercall's first parameter is the address in guest's memory -- GitLab From b6c2c22fa7012616b3039c9f559bf01195137b9d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:54 +0100 Subject: [PATCH 0781/1423] KVM: x86: hyper-v: Don't use sparse_set_to_vcpu_mask() in kvm_hv_send_ipi() Get rid of on-stack allocation of vcpu_mask and optimize kvm_hv_send_ipi() for a smaller number of vCPUs in the request. When Hyper-V TLB flush is in use, HvSendSyntheticClusterIpi{,Ex} calls are not commonly used to send IPIs to a large number of vCPUs (and are rarely used in general). Introduce hv_is_vp_in_sparse_set() to directly check if the specified VP_ID is present in sparse vCPU set. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-17-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 2fceb85687c13..0bfa59838e0a5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1741,6 +1741,28 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, } } +static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) +{ + int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; + unsigned long sbank; + + if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) + return false; + + /* + * The index into the sparse bank is the number of preceding bits in + * the valid mask. Optimize for VMs with <64 vCPUs by skipping the + * fancy math if there can't possibly be preceding bits. + */ + if (valid_bit_nr) + sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); + else + sbank = 0; + + return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, + (unsigned long *)&sparse_banks[sbank]); +} + struct kvm_hv_hcall { u64 param; u64 ingpa; @@ -2035,8 +2057,8 @@ ret_success: ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } -static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, - unsigned long *vcpu_bitmap) +static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, + u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, @@ -2046,7 +2068,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if (sparse_banks && + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ @@ -2059,7 +2083,6 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; - DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; @@ -2121,13 +2144,10 @@ check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - if (all_cpus) { - kvm_send_ipi_to_many(kvm, vector, NULL); - } else { - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); - - kvm_send_ipi_to_many(kvm, vector, vcpu_mask); - } + if (all_cpus) + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + else + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; -- GitLab From 53ca765a041d5a24650d3f01bced791be5d72df7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:55 +0100 Subject: [PATCH 0782/1423] KVM: x86: hyper-v: Create a separate fifo for L2 TLB flush To handle L2 TLB flush requests, KVM needs to use a separate fifo from regular (L1) Hyper-V TLB flush requests: e.g. when a request to flush something in L2 is made, the target vCPU can transition from L2 to L1, receive a request to flush a GVA for L1 and then try to enter L2 back. The first request needs to be processed at this point. Similarly, requests to flush GVAs in L1 must wait until L2 exits to L1. No functional change as KVM doesn't handle L2 TLB flush requests from L2 yet. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-18-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/hyperv.c | 11 +++++++---- arch/x86/kvm/hyperv.h | 19 ++++++++++++++++--- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e35dcf40dc7c..89f9c98ff4456 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -631,6 +631,12 @@ struct kvm_vcpu_hv_synic { */ #define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) +enum hv_tlb_flush_fifos { + HV_L1_TLB_FLUSH_FIFO, + HV_L2_TLB_FLUSH_FIFO, + HV_NR_TLB_FLUSH_FIFOS, +}; + struct kvm_vcpu_hv_tlb_flush_fifo { spinlock_t write_lock; DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); @@ -658,7 +664,7 @@ struct kvm_vcpu_hv { u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; - struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo; + struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0bfa59838e0a5..9898463103032 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -956,8 +956,10 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) hv_vcpu->vp_index = vcpu->vcpu_idx; - INIT_KFIFO(hv_vcpu->tlb_flush_fifo.entries); - spin_lock_init(&hv_vcpu->tlb_flush_fifo.write_lock); + for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { + INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); + spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); + } return 0; } @@ -1839,7 +1841,8 @@ static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count) if (!hv_vcpu) return; - tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + /* kvm_hv_flush_tlb() is not ready to handle requests for L2s yet */ + tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo[HV_L1_TLB_FLUSH_FIFO]; spin_lock(&tlb_flush_fifo->write_lock); @@ -1874,7 +1877,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) if (!tdp_enabled || !hv_vcpu) return -EINVAL; - tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index f79edf9234cd4..8942e8c6c912e 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -22,6 +22,7 @@ #define __ARCH_X86_KVM_HYPERV_H__ #include +#include "x86.h" /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -151,15 +152,27 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu, + bool is_guest_mode) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO : + HV_L1_TLB_FLUSH_FIFO; + + /* KVM does not handle L2 TLB flush requests yet */ + WARN_ON_ONCE(i != HV_L1_TLB_FLUSH_FIFO); + + return &hv_vcpu->tlb_flush_fifo[i]; +} + static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - if (!hv_vcpu || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) return; - tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo; + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); kfifo_reset_out(&tlb_flush_fifo->entries); } -- GitLab From 7d5e88d301f84a7b64602dbe3640f288223095ea Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:56 +0100 Subject: [PATCH 0783/1423] KVM: x86: hyper-v: Use preallocated buffer in 'struct kvm_vcpu_hv' instead of on-stack 'sparse_banks' To make kvm_hv_flush_tlb() ready to handle L2 TLB flush requests, KVM needs to allow for all 64 sparse vCPU banks regardless of KVM_MAX_VCPUs as L1 may use vCPU overcommit for L2. To avoid growing on-stack allocation, make 'sparse_banks' part of per-vCPU 'struct kvm_vcpu_hv' which is allocated dynamically. Note: sparse_set_to_vcpu_mask() can't currently be used to handle L2 requests as KVM does not keep L2 VM_ID -> L2 VCPU_ID -> L1 vCPU mappings, i.e. its vp_bitmap array is still bounded by the number of L1 vCPUs and so can remain an on-stack allocation. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-19-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/hyperv.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89f9c98ff4456..4596f19f927b1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -665,6 +665,9 @@ struct kvm_vcpu_hv { } cpuid_cache; struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; + + /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ + u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 9898463103032..058e14564389d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1906,6 +1906,8 @@ out_flush_all: static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; @@ -1919,7 +1921,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; u64 *tlb_flush_entries; u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *v; unsigned long i; bool all_cpus; @@ -2083,11 +2084,12 @@ static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 valid_bank_mask; - u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; -- GitLab From 38edb45231832ef2aa191e2f3f77e30ad0bb4b61 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:57 +0100 Subject: [PATCH 0784/1423] KVM: nVMX: Keep track of hv_vm_id/hv_vp_id when eVMCS is in use To handle L2 TLB flush requests, KVM needs to keep track of L2's VM_ID/ VP_IDs which are set by L1 hypervisor. 'Partition assist page' address is also needed to handle post-flush exit to L1 upon request. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-20-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/vmx/nested.c | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4596f19f927b1..63dad1e129698 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -668,6 +668,12 @@ struct kvm_vcpu_hv { /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + + struct { + u64 pa_page_gpa; + u64 vm_id; + u32 vp_id; + } nested; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 048b2c3e3b3fc..cce68fd5befbc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { @@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) } vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + if (hv_vcpu) { + hv_vcpu->nested.pa_page_gpa = INVALID_GPA; + hv_vcpu->nested.vm_id = 0; + hv_vcpu->nested.vp_id = 0; + } } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -1557,11 +1564,19 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { + hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; + hv_vcpu->nested.vm_id = evmcs->hv_vm_id; + hv_vcpu->nested.vp_id = evmcs->hv_vp_id; + } + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; -- GitLab From e45aa2444d280747d27d4d98685d761125c4e364 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:58 +0100 Subject: [PATCH 0785/1423] KVM: nSVM: Keep track of Hyper-V hv_vm_id/hv_vp_id Similar to nSVM, KVM needs to know L2's VM_ID/VP_ID and Partition assist page address to handle L2 TLB flush requests. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-21-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/hyperv.h | 15 +++++++++++++++ arch/x86/kvm/svm/nested.c | 2 ++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index c59544cdf03b7..e97d80974e720 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -9,5 +9,20 @@ #include #include "../hyperv.h" +#include "svm.h" + +static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return; + + hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page; + hv_vcpu->nested.vm_id = hve->hv_vm_id; + hv_vcpu->nested.vp_id = hve->hv_vp_id; +} #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cc8f47e7e294d..aa36c349da430 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -800,6 +800,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, if (kvm_vcpu_apicv_active(vcpu)) kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + nested_svm_hv_update_vm_vp_ids(vcpu); + return 0; } -- GitLab From b0c9c25e46252a576a974dd659f2396774e0dbb1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:53:59 +0100 Subject: [PATCH 0786/1423] KVM: x86: Introduce .hv_inject_synthetic_vmexit_post_tlb_flush() nested hook Hyper-V supports injecting synthetic L2->L1 exit after performing L2 TLB flush operation but the procedure is vendor specific. Introduce .hv_inject_synthetic_vmexit_post_tlb_flush nested hook for it. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-22-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/Makefile | 3 ++- arch/x86/kvm/svm/hyperv.c | 11 +++++++++++ arch/x86/kvm/svm/hyperv.h | 2 ++ arch/x86/kvm/svm/nested.c | 1 + arch/x86/kvm/vmx/hyperv.c | 4 ++++ arch/x86/kvm/vmx/hyperv.h | 1 + arch/x86/kvm/vmx/nested.c | 1 + 8 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kvm/svm/hyperv.c diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 63dad1e129698..ebf90f4f1a21e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1722,6 +1722,7 @@ struct kvm_x86_nested_ops { int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); + void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4cf407563feea..80e3fe184d17e 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -26,7 +26,8 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o -kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o +kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ + svm/sev.o svm/hyperv.o ifdef CONFIG_HYPERV kvm-amd-y += svm/svm_onhyperv.o diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c new file mode 100644 index 0000000000000..911f51021af1f --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM specific code for Hyper-V on KVM. + * + * Copyright 2022 Red Hat, Inc. and/or its affiliates. + */ +#include "hyperv.h" + +void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) +{ +} diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index e97d80974e720..7564bdf652e47 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -25,4 +25,6 @@ static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) hv_vcpu->nested.vp_id = hve->hv_vp_id; } +void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); + #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index aa36c349da430..748e4de40c8fa 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1730,4 +1730,5 @@ struct kvm_x86_nested_ops svm_nested_ops = { .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, + .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush, }; diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 5e239158174ea..f05464db4fdc1 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -506,3 +506,7 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, return 0; } + +void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) +{ +} diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 99a151af7a817..8efaffe9215bf 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -243,5 +243,6 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); +void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index cce68fd5befbc..396712d13211f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6995,4 +6995,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, + .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, }; -- GitLab From 3c9eb0655fc03fb5e84f1db334ebc832d9c5ac31 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:00 +0100 Subject: [PATCH 0787/1423] KVM: x86: hyper-v: Introduce kvm_hv_is_tlb_flush_hcall() The newly introduced helper checks whether vCPU is performing a Hyper-V TLB flush hypercall. This is required to filter out L2 TLB flush hypercalls for processing. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-23-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 8942e8c6c912e..5f9c76b45f468 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -177,6 +177,23 @@ static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) kfifo_reset_out(&tlb_flush_fifo->entries); } +static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + u16 code; + + if (!hv_vcpu) + return false; + + code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) : + kvm_rax_read(vcpu); + + return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || + code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX); +} + int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); #endif -- GitLab From c58a318f6090efe06e6702b8882e2026f44f620e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:01 +0100 Subject: [PATCH 0788/1423] KVM: x86: hyper-v: L2 TLB flush Handle L2 TLB flush requests by going through all vCPUs and checking whether there are vCPUs running the same VM_ID with a VP_ID specified in the requests. Perform synthetic exit to L2 upon finish. Note, while checking VM_ID/VP_ID of running vCPUs seem to be a bit racy, we count on the fact that KVM flushes the whole L2 VPID upon transition. Also, KVM_REQ_HV_TLB_FLUSH request needs to be done upon transition between L1 and L2 to make sure all pending requests are always processed. For the reference, Hyper-V TLFS refers to the feature as "Direct Virtual Flush". Note, nVMX/nSVM code does not handle VMCALL/VMMCALL from L2 yet. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-24-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 80 ++++++++++++++++++++++++++++++++++++------- arch/x86/kvm/hyperv.h | 3 -- arch/x86/kvm/trace.h | 21 +++++++----- 3 files changed, 80 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 058e14564389d..3715a6f026a23 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -34,6 +34,7 @@ #include #include +#include #include #include "trace.h" @@ -1832,18 +1833,16 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc entries, consumed_xmm_halves, offset); } -static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count) +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, + u64 *entries, int count) { - struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; if (!hv_vcpu) return; - /* kvm_hv_flush_tlb() is not ready to handle requests for L2s yet */ - tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo[HV_L1_TLB_FLUSH_FIFO]; - spin_lock(&tlb_flush_fifo->write_lock); /* @@ -1912,6 +1911,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; /* * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' * entries on the TLB flush fifo. The last entry, however, needs to be @@ -1962,7 +1962,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) } trace_kvm_hv_flush_tlb(flush.processor_mask, - flush.address_space, flush.flags); + flush.address_space, flush.flags, + is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; @@ -1993,7 +1994,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, - flush_ex.flags); + flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != @@ -2037,19 +2038,57 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - if (all_cpus) { - kvm_for_each_vcpu(i, v, kvm) - hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt); + if (all_cpus && !is_guest_mode(vcpu)) { + kvm_for_each_vcpu(i, v, kvm) { + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); - } else { + } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { v = kvm_get_vcpu(kvm, i); if (!v) continue; - hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt); + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); + } + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); + } else { + struct kvm_vcpu_hv *hv_v; + + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); + + kvm_for_each_vcpu(i, v, kvm) { + hv_v = to_hv_vcpu(v); + + /* + * The following check races with nested vCPUs entering/exiting + * and/or migrating between L1's vCPUs, however the only case when + * KVM *must* flush the TLB is when the target L2 vCPU keeps + * running on the same L1 vCPU from the moment of the request until + * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other + * cases, e.g. when the target L2 vCPU migrates to a different L1 + * vCPU or when the corresponding L1 vCPU temporary switches to a + * different L2 vCPU while the request is being processed. + */ + if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) + continue; + + if (!all_cpus && + !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, + sparse_banks)) + continue; + + __set_bit(i, vcpu_mask); + tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); + hv_tlb_flush_enqueue(v, tlb_flush_fifo, + tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); @@ -2239,10 +2278,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + u32 tlb_lock_count = 0; + int ret; + + if (hv_result_success(result) && is_guest_mode(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu) && + kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, + &tlb_lock_count, sizeof(tlb_lock_count))) + result = HV_STATUS_INVALID_HYPERCALL_INPUT; + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); + + ret = kvm_skip_emulated_instruction(vcpu); + + if (tlb_lock_count) + kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); + + return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 5f9c76b45f468..7706e203ff43e 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -159,9 +159,6 @@ static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struc int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO : HV_L1_TLB_FLUSH_FIFO; - /* KVM does not handle L2 TLB flush requests yet */ - WARN_ON_ONCE(i != HV_L1_TLB_FLUSH_FIFO); - return &hv_vcpu->tlb_flush_fifo[i]; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bc25589ad5886..09f3392dd8300 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1547,38 +1547,41 @@ TRACE_EVENT(kvm_hv_timer_state, * Tracepoint for kvm_hv_flush_tlb. */ TRACE_EVENT(kvm_hv_flush_tlb, - TP_PROTO(u64 processor_mask, u64 address_space, u64 flags), - TP_ARGS(processor_mask, address_space, flags), + TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode), + TP_ARGS(processor_mask, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, processor_mask) __field(u64, address_space) __field(u64, flags) + __field(bool, guest_mode) ), TP_fast_assign( __entry->processor_mask = processor_mask; __entry->address_space = address_space; __entry->flags = flags; + __entry->guest_mode = guest_mode; ), - TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx", + TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s", __entry->processor_mask, __entry->address_space, - __entry->flags) + __entry->flags, __entry->guest_mode ? "(L2)" : "") ); /* * Tracepoint for kvm_hv_flush_tlb_ex. */ TRACE_EVENT(kvm_hv_flush_tlb_ex, - TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags), - TP_ARGS(valid_bank_mask, format, address_space, flags), + TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode), + TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode), TP_STRUCT__entry( __field(u64, valid_bank_mask) __field(u64, format) __field(u64, address_space) __field(u64, flags) + __field(bool, guest_mode) ), TP_fast_assign( @@ -1586,12 +1589,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, __entry->format = format; __entry->address_space = address_space; __entry->flags = flags; + __entry->guest_mode = guest_mode; ), TP_printk("valid_bank_mask 0x%llx format 0x%llx " - "address_space 0x%llx flags 0x%llx", + "address_space 0x%llx flags 0x%llx %s", __entry->valid_bank_mask, __entry->format, - __entry->address_space, __entry->flags) + __entry->address_space, __entry->flags, + __entry->guest_mode ? "(L2)" : "") ); /* -- GitLab From d4baf1a9a572910d7b4cd63d23bf4be89b7648bd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:02 +0100 Subject: [PATCH 0789/1423] KVM: x86: hyper-v: Introduce fast guest_hv_cpuid_has_l2_tlb_flush() check Introduce a helper to quickly check if KVM needs to handle VMCALL/VMMCALL from L2 in L0 to process L2 TLB flush requests. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-25-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 7706e203ff43e..bd698eb2bda10 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -174,6 +174,14 @@ static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) kfifo_reset_out(&tlb_flush_fifo->entries); } +static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return hv_vcpu && + (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH); +} + static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); -- GitLab From 046f5756c49106471bc98bd32b87a62d0717ddda Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:03 +0100 Subject: [PATCH 0790/1423] KVM: nVMX: hyper-v: Cache VP assist page in 'struct kvm_vcpu_hv' In preparation to enabling L2 TLB flush, cache VP assist page in 'struct kvm_vcpu_hv'. While on it, rename nested_enlightened_vmentry() to nested_get_evmptr() and make it return eVMCS GPA directly. No functional change intended. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-26-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/hyperv.c | 10 ++++++---- arch/x86/kvm/hyperv.h | 3 +-- arch/x86/kvm/vmx/hyperv.c | 21 +++++++-------------- arch/x86/kvm/vmx/hyperv.h | 2 +- arch/x86/kvm/vmx/nested.c | 6 +++--- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ebf90f4f1a21e..d1013c4f673ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -669,6 +669,8 @@ struct kvm_vcpu_hv { /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + struct hv_vp_assist_page vp_assist_page; + struct { u64 pa_page_gpa; u64 vm_id; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3715a6f026a23..ce245e37d08fc 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -900,13 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page) +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { - if (!kvm_hv_assist_page_enabled(vcpu)) + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) return false; + return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, - assist_page, sizeof(*assist_page)); + &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index bd698eb2bda10..81313e418b80a 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -108,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, - struct hv_vp_assist_page *assist_page); +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu, int timer_index) diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index f05464db4fdc1..bceca1a99804f 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -321,24 +321,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); -bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) +u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { - struct hv_vp_assist_page assist_page; - - *evmcs_gpa = -1ull; - - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) - return false; - - if (unlikely(!assist_page.enlighten_vmentry)) - return false; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs))) - return false; + if (unlikely(!kvm_hv_get_assist_page(vcpu))) + return EVMPTR_INVALID; - *evmcs_gpa = assist_page.current_nested_vmcs; + if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry)) + return EVMPTR_INVALID; - return true; + return hv_vcpu->vp_assist_page.current_nested_vmcs; } uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 8efaffe9215bf..8bf366730d333 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -237,7 +237,7 @@ enum nested_evmptrld_status { EVMPTRLD_ERROR, }; -bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); +u64 nested_get_evmptr(struct kvm_vcpu *vcpu); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 396712d13211f..38e6cb8abe627 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1992,7 +1992,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( if (likely(!guest_cpuid_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; - if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + evmcs_gpa = nested_get_evmptr(vcpu); + if (!evmptr_is_valid(evmcs_gpa)) { nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; } @@ -5221,7 +5222,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; - u64 evmcs_gpa; int r; if (!nested_vmx_check_permission(vcpu)) @@ -5247,7 +5247,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) * vmx->nested.hv_evmcs but this shouldn't be a problem. */ if (likely(!guest_cpuid_has_evmcs(vcpu) || - !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { + !evmptr_is_valid(nested_get_evmptr(vcpu)))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); -- GitLab From c30e9bc8b606077142969a807ada42ca921e605a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:04 +0100 Subject: [PATCH 0791/1423] KVM: nVMX: hyper-v: Enable L2 TLB flush Enable L2 TLB flush feature on nVMX when: - Enlightened VMCS is in use. - The feature flag is enabled in eVMCS. - The feature flag is enabled in partition assist page. Perform synthetic vmexit to L1 after processing TLB flush call upon request (HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH). Note: nested_evmcs_l2_tlb_flush_enabled() uses cached VP assist page copy which gets updated from nested_vmx_handle_enlightened_vmptrld(). This is also guaranteed to happen post migration with eVMCS backed L2 running. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-27-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 9 +++++++++ arch/x86/kvm/vmx/hyperv.c | 17 +++++++++++++++++ arch/x86/kvm/vmx/hyperv.h | 1 + arch/x86/kvm/vmx/nested.c | 20 ++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6639979302ab7..b25c6792d409c 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -600,6 +600,15 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF +/* + * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by + * pairing it with architecturally impossible exit reasons. Bit 28 is set only + * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit + * is pending. I.e. it will never be set by hardware for non-SMI exits (there + * are only three), nor will it ever be set unless the VMM is an STM. + */ +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + /* * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose * SVM enlightenments to guests. diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index bceca1a99804f..04a0bba58c7de 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -5,6 +5,7 @@ #include "../cpuid.h" #include "hyperv.h" +#include "nested.h" #include "vmcs.h" #include "vmx.h" #include "trace.h" @@ -500,6 +501,22 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, return 0; } +bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + if (!hv_vcpu || !evmcs) + return false; + + if (!evmcs->hv_enlightenments_control.nested_flush_hypercall) + return false; + + return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; +} + void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) { + nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0); } diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 8bf366730d333..571e7929d14e7 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -243,6 +243,7 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); +bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #endif /* __KVM_X86_VMX_HYPERV_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 38e6cb8abe627..b28be793de298 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1132,6 +1132,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. + */ + if (to_hv_vcpu(vcpu) && enable_ept) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a @@ -3267,6 +3276,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { + /* + * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy + * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory + * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post + * migration. + */ if (!nested_get_evmcs_page(vcpu)) { pr_debug_ratelimited("%s: enlightened vmptrld failed\n", __func__); @@ -6144,6 +6159,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, * Handle L2's bus locks in L0 directly. */ return true; + case EXIT_REASON_VMCALL: + /* Hyper-V L2 TLB flush hypercall is handled by L0 */ + return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_evmcs_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu); default: break; } -- GitLab From b415d8d417bbe5403626b74e1041101ac23d602f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:05 +0100 Subject: [PATCH 0792/1423] KVM: x86: Make kvm_hv_get_assist_page() return 0/-errno Convert kvm_hv_get_assist_page() to return 'int' and propagate possible errors from kvm_read_guest_cached(). Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-28-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 8 ++++---- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/vmx/hyperv.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ce245e37d08fc..15880da73a7b1 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -900,15 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) - return false; + return -EFAULT; - return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, - &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 81313e418b80a..5157622c2fb3a 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -108,7 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); -bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu); +int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu, int timer_index) diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 04a0bba58c7de..ae03d1fe03552 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -326,7 +326,7 @@ u64 nested_get_evmptr(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - if (unlikely(!kvm_hv_get_assist_page(vcpu))) + if (unlikely(kvm_hv_get_assist_page(vcpu))) return EVMPTR_INVALID; if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry)) -- GitLab From 3f4a812edf5cb0a50e65fbdfafdb3e688da18f16 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:06 +0100 Subject: [PATCH 0793/1423] KVM: nSVM: hyper-v: Enable L2 TLB flush Implement Hyper-V L2 TLB flush for nSVM. The feature needs to be enabled both in extended 'nested controls' in VMCB and VP assist page. According to Hyper-V TLFS, synthetic vmexit to L1 is performed with - HV_SVM_EXITCODE_ENL exit_code. - HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH exit_info_1. Note: VP assist page is cached in 'struct kvm_vcpu_hv' so recalc_intercepts() doesn't need to read from guest's memory. KVM needs to update the case upon each VMRUN and after svm_set_nested_state (svm_get_nested_state_pages()) to handle the case when the guest got migrated while L2 was running. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-29-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 4 ++++ arch/x86/kvm/hyperv.h | 11 ++++++++++ arch/x86/kvm/svm/hyperv.c | 7 ++++++ arch/x86/kvm/svm/hyperv.h | 15 +++++++++++++ arch/x86/kvm/svm/nested.c | 35 ++++++++++++++++++++++++++++-- 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index b25c6792d409c..e3efaf6e6b628 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -631,6 +631,10 @@ struct hv_vmcb_enlightenments { */ #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + struct hv_partition_assist_pg { u32 tlb_lock_count; }; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 5157622c2fb3a..9f96414a31c52 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -198,6 +198,17 @@ static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX); } +static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) +{ + if (!to_hv_vcpu(vcpu)) + return 0; + + if (!kvm_hv_assist_page_enabled(vcpu)) + return 0; + + return kvm_hv_get_assist_page(vcpu); +} + int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 911f51021af1f..088f6429b24ce 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -8,4 +8,11 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; + svm->vmcb->control.exit_info_2 = 0; + nested_svm_vmexit(svm); } diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index 7564bdf652e47..02f4784b5d446 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -25,6 +25,21 @@ static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) hv_vcpu->nested.vp_id = hve->hv_vp_id; } +static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + if (!hve->hv_enlightenments_control.nested_flush_hypercall) + return false; + + return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; +} + void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 748e4de40c8fa..bc9cd7086fa97 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -150,8 +150,12 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_clr_intercept(c, INTERCEPT_VINTR); } - /* We don't want to see VMMCALLs from a nested guest */ - vmcb_clr_intercept(c, INTERCEPT_VMMCALL); + /* + * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB + * flush feature is enabled. + */ + if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu)) + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; @@ -473,6 +477,15 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { + /* + * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or + * L2's VP_ID upon request from the guest. Make sure we check for + * pending entries in the right FIFO upon L1/L2 transition as these + * requests are put by other vCPUs asynchronously. + */ + if (to_hv_vcpu(vcpu) && npt_enabled) + kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu); + /* * TODO: optimize unconditional TLB flush/MMU sync. A partial list of * things to fix before this can be conditional: @@ -824,6 +837,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) return 1; } + /* This fails when VP assist page is enabled but the supplied GPA is bogus */ + ret = kvm_hv_verify_vp_assist(vcpu); + if (ret) { + kvm_inject_gp(vcpu, 0); + return ret; + } + vmcb12_gpa = svm->vmcb->save.rax; ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { @@ -1421,6 +1441,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; + struct kvm_vcpu *vcpu = &svm->vcpu; switch (exit_code) { case SVM_EXIT_INTR: @@ -1439,6 +1460,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_HOST; break; } + case SVM_EXIT_VMMCALL: + /* Hyper-V L2 TLB flush hypercall is handled by L0 */ + if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_svm_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu)) + return NESTED_EXIT_HOST; + break; default: break; } @@ -1719,6 +1747,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) return false; } + if (kvm_hv_verify_vp_assist(vcpu)) + return false; + return true; } -- GitLab From f4de6a1fa3ee81197239603756fc5c4259e5ef1b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:07 +0100 Subject: [PATCH 0794/1423] KVM: x86: Expose Hyper-V L2 TLB flush feature With both nSVM and nVMX implementations in place, KVM can now expose Hyper-V L2 TLB flush feature to userspace. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-30-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 15880da73a7b1..2c7f2a26421e8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2779,6 +2779,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; -- GitLab From 676a863ce605caca3a559bdb8e40f640c15f1fde Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:08 +0100 Subject: [PATCH 0795/1423] KVM: selftests: Better XMM read/write helpers set_xmm()/get_xmm() helpers are fairly useless as they only read 64 bits from 128-bit registers. Moreover, these helpers are not used. Borrow _kvm_read_sse_reg()/_kvm_write_sse_reg() from KVM limiting them to XMM0-XMM8 for now. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-31-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 70 ++++++++++--------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index f838ac5865dc5..a10f39affa45e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -603,71 +603,73 @@ static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) !this_cpu_has(feature.anti_feature); } -#define SET_XMM(__var, __xmm) \ - asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) +typedef u32 __attribute__((vector_size(16))) sse128_t; +#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } +#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) +#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) -static inline void set_xmm(int n, unsigned long val) +static inline void read_sse_reg(int reg, sse128_t *data) { - switch (n) { + switch (reg) { case 0: - SET_XMM(val, xmm0); + asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: - SET_XMM(val, xmm1); + asm("movdqa %%xmm1, %0" : "=m"(*data)); break; case 2: - SET_XMM(val, xmm2); + asm("movdqa %%xmm2, %0" : "=m"(*data)); break; case 3: - SET_XMM(val, xmm3); + asm("movdqa %%xmm3, %0" : "=m"(*data)); break; case 4: - SET_XMM(val, xmm4); + asm("movdqa %%xmm4, %0" : "=m"(*data)); break; case 5: - SET_XMM(val, xmm5); + asm("movdqa %%xmm5, %0" : "=m"(*data)); break; case 6: - SET_XMM(val, xmm6); + asm("movdqa %%xmm6, %0" : "=m"(*data)); break; case 7: - SET_XMM(val, xmm7); + asm("movdqa %%xmm7, %0" : "=m"(*data)); break; + default: + BUG(); } } -#define GET_XMM(__xmm) \ -({ \ - unsigned long __val; \ - asm volatile("movq %%"#__xmm", %0" : "=r"(__val)); \ - __val; \ -}) - -static inline unsigned long get_xmm(int n) +static inline void write_sse_reg(int reg, const sse128_t *data) { - assert(n >= 0 && n <= 7); - - switch (n) { + switch (reg) { case 0: - return GET_XMM(xmm0); + asm("movdqa %0, %%xmm0" : : "m"(*data)); + break; case 1: - return GET_XMM(xmm1); + asm("movdqa %0, %%xmm1" : : "m"(*data)); + break; case 2: - return GET_XMM(xmm2); + asm("movdqa %0, %%xmm2" : : "m"(*data)); + break; case 3: - return GET_XMM(xmm3); + asm("movdqa %0, %%xmm3" : : "m"(*data)); + break; case 4: - return GET_XMM(xmm4); + asm("movdqa %0, %%xmm4" : : "m"(*data)); + break; case 5: - return GET_XMM(xmm5); + asm("movdqa %0, %%xmm5" : : "m"(*data)); + break; case 6: - return GET_XMM(xmm6); + asm("movdqa %0, %%xmm6" : : "m"(*data)); + break; case 7: - return GET_XMM(xmm7); + asm("movdqa %0, %%xmm7" : : "m"(*data)); + break; + default: + BUG(); } - - /* never reached */ - return 0; } static inline void cpu_relax(void) -- GitLab From c05a0a71c5d0aea010af19f21ccdc0d576066790 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:09 +0100 Subject: [PATCH 0796/1423] KVM: selftests: Move HYPERV_LINUX_OS_ID definition to a common header HYPERV_LINUX_OS_ID needs to be written to HV_X64_MSR_GUEST_OS_ID by each Hyper-V specific selftest. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-32-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/hyperv.h | 3 +++ tools/testing/selftests/kvm/x86_64/hyperv_features.c | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index b66910702c0a7..f0a8a93694b2b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -185,4 +185,7 @@ /* hypercall options */ #define HV_HYPERCALL_FAST_BIT BIT(16) +/* Proper HV_X64_MSR_GUEST_OS_ID value */ +#define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 2b6d455acf8ae..6558cc61cf69f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -13,8 +13,6 @@ #include "processor.h" #include "hyperv.h" -#define LINUX_OS_ID ((u64)0x8100 << 48) - static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, vm_vaddr_t output_address, uint64_t *hv_status) { @@ -72,7 +70,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) GUEST_ASSERT(hcall->control); - wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { @@ -170,7 +168,7 @@ static void guest_test_msrs_access(void) */ msr->idx = HV_X64_MSR_GUEST_OS_ID; msr->write = 1; - msr->write_val = LINUX_OS_ID; + msr->write_val = HYPERV_LINUX_OS_ID; msr->available = 1; break; case 3: -- GitLab From caf4110fbaa89a20733facb062381e89961ce698 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:10 +0100 Subject: [PATCH 0797/1423] KVM: selftests: Move the function doing Hyper-V hypercall to a common header All Hyper-V specific tests issuing hypercalls need this. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-33-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/hyperv.h | 19 ++++++++++++++++++ .../selftests/kvm/x86_64/hyperv_features.c | 20 +------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index f0a8a93694b2b..7ed8f4f5f7d89 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -185,6 +185,25 @@ /* hypercall options */ #define HV_HYPERCALL_FAST_BIT BIT(16) +static inline uint8_t hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address, + uint64_t *hv_status) +{ + uint64_t error_code; + uint8_t vector; + + /* Note both the hypercall and the "asm safe" clobber r9-r11. */ + asm volatile("mov %[output_address], %%r8\n\t" + KVM_ASM_SAFE("vmcall") + : "=a" (*hv_status), + "+c" (control), "+d" (input_address), + KVM_ASM_SAFE_OUTPUTS(vector, error_code) + : [output_address] "r"(output_address), + "a" (-EFAULT) + : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); + return vector; +} + /* Proper HV_X64_MSR_GUEST_OS_ID value */ #define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 6558cc61cf69f..5ff4ff2365bb3 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -13,24 +13,6 @@ #include "processor.h" #include "hyperv.h" -static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, - vm_vaddr_t output_address, uint64_t *hv_status) -{ - uint64_t error_code; - uint8_t vector; - - /* Note both the hypercall and the "asm safe" clobber r9-r11. */ - asm volatile("mov %[output_address], %%r8\n\t" - KVM_ASM_SAFE("vmcall") - : "=a" (*hv_status), - "+c" (control), "+d" (input_address), - KVM_ASM_SAFE_OUTPUTS(vector, error_code) - : [output_address] "r"(output_address), - "a" (-EFAULT) - : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); - return vector; -} - struct msr_data { uint32_t idx; bool available; @@ -80,7 +62,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) input = output = 0; } - vector = hypercall(hcall->control, input, output, &res); + vector = hyperv_hypercall(hcall->control, input, output, &res); if (hcall->ud_expected) { GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); } else { -- GitLab From 998489245d8469c21f1517c4bbe192e0ef3c3374 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:11 +0100 Subject: [PATCH 0798/1423] KVM: selftests: Hyper-V PV IPI selftest Introduce a selftest for Hyper-V PV IPI hypercalls (HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx). The test creates one 'sender' vCPU and two 'receiver' vCPU and then issues various combinations of send IPI hypercalls in both 'normal' and 'fast' (with XMM input where necessary) mode. Later, the test checks whether IPIs were delivered to the expected destination vCPU[s]. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-34-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/hyperv.h | 35 +- .../selftests/kvm/x86_64/hyperv_features.c | 2 +- .../testing/selftests/kvm/x86_64/hyperv_ipi.c | 314 ++++++++++++++++++ 5 files changed, 349 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_ipi.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 582e2e198fbf5..3b3218cb46ed6 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -25,6 +25,7 @@ /x86_64/hyperv_clock /x86_64/hyperv_cpuid /x86_64/hyperv_features +/x86_64/hyperv_ipi /x86_64/hyperv_svm_test /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index f62dcfcda618c..4095b1212f08c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index 7ed8f4f5f7d89..c757e40011731 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -9,6 +9,8 @@ #ifndef SELFTEST_KVM_HYPERV_H #define SELFTEST_KVM_HYPERV_H +#include "processor.h" + #define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 #define HYPERV_CPUID_INTERFACE 0x40000001 #define HYPERV_CPUID_VERSION 0x40000002 @@ -184,10 +186,15 @@ /* hypercall options */ #define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 -static inline uint8_t hyperv_hypercall(u64 control, vm_vaddr_t input_address, - vm_vaddr_t output_address, - uint64_t *hv_status) +/* + * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status' + * is set to the hypercall status (if no exception occurred). + */ +static inline uint8_t __hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address, + uint64_t *hv_status) { uint64_t error_code; uint8_t vector; @@ -204,6 +211,28 @@ static inline uint8_t hyperv_hypercall(u64 control, vm_vaddr_t input_address, return vector; } +/* Issue a Hyper-V hypercall and assert that it succeeded. */ +static inline void hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address) +{ + uint64_t hv_status; + uint8_t vector; + + vector = __hyperv_hypercall(control, input_address, output_address, &hv_status); + + GUEST_ASSERT(!vector); + GUEST_ASSERT((hv_status & 0xffff) == 0); +} + +/* Write 'Fast' hypercall input 'data' to the first 'n_sse_regs' SSE regs */ +static inline void hyperv_write_xmm_input(void *data, int n_sse_regs) +{ + int i; + + for (i = 0; i < n_sse_regs; i++) + write_sse_reg(i, (sse128_t *)(data + sizeof(sse128_t) * i)); +} + /* Proper HV_X64_MSR_GUEST_OS_ID value */ #define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 5ff4ff2365bb3..3163c3e8db0a7 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -62,7 +62,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) input = output = 0; } - vector = hyperv_hypercall(hcall->control, input, output, &res); + vector = __hyperv_hypercall(hcall->control, input, output, &res); if (hcall->ud_expected) { GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); } else { diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c new file mode 100644 index 0000000000000..8b791eac7d5a3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include + +#include "kvm_util.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define RECEIVER_VCPU_ID_1 2 +#define RECEIVER_VCPU_ID_2 65 + +#define IPI_VECTOR 0xfe + +static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1]; + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[2]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +}; + +static inline void hv_init(vm_vaddr_t pgs_gpa) +{ + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); +} + +static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + u32 vcpu_id; + + x2apic_enable(); + hv_init(pgs_gpa); + + vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + /* Signal sender vCPU we're ready */ + ipis_rcvd[vcpu_id] = (u64)-1; + + for (;;) + asm volatile("sti; hlt; cli"); +} + +static void guest_ipi_handler(struct ex_regs *regs) +{ + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + ipis_rcvd[vcpu_id]++; + wrmsr(HV_X64_MSR_EOI, 1); +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 100000000; i++) + asm volatile("nop"); +} + +static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page; + struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page; + int stage = 1, ipis_expected[2] = {0}; + + hv_init(pgs_gpa); + GUEST_SYNC(stage++); + + /* Wait for receiver vCPUs to come up */ + while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2]) + nop_loop(); + ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0; + + /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + ipi->vector = IPI_VECTOR; + ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 0; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* + * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL. + * Nothing to write anything to XMM regs. + */ + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, HV_GENERIC_SET_ALL); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + int old, r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + + TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id); + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + unsigned int exit_reason; + vm_vaddr_t hcall_page; + pthread_t threads[2]; + int stage = 1, r; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + vm_init_descriptor_tables(vm); + + vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); + vcpu_init_descriptor_tables(vcpu[1]); + vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); + vcpu_init_descriptor_tables(vcpu[2]); + vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); + + vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_hv_cpuid(vcpu[0]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", r); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", errno); + + while (true) { + vcpu_run(vcpu[0]); + + exit_reason = vcpu[0]->run->exit_reason; + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + exit_reason, exit_reason_str(exit_reason)); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return r; +} -- GitLab From 56fc7732031d9999a35cb43b3de30d52ae30fd3f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:12 +0100 Subject: [PATCH 0799/1423] KVM: selftests: Fill in vm->vpages_mapped bitmap in virt_map() too Similar to vm_vaddr_alloc(), virt_map() needs to reflect the mapping in vm->vpages_mapped. While on it, remove unneeded code wrapping in vm_vaddr_alloc(). Reviewed-by: Andrew Jones Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-35-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5ac8f207ed92d..a3ff96ec02355 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1319,8 +1319,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) virt_pg_map(vm, vaddr, paddr); - sparsebit_set(vm->vpages_mapped, - vaddr >> vm->page_shift); + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } return vaddr_start; @@ -1393,6 +1392,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, virt_pg_map(vm, vaddr, paddr); vaddr += page_size; paddr += page_size; + + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } } -- GitLab From 2d4a5f91837f4a75cf02010b9a52bfe52d0efd40 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:13 +0100 Subject: [PATCH 0800/1423] KVM: selftests: Export vm_vaddr_unused_gap() to make it possible to request unmapped ranges Currently, tests can only request a new vaddr range by using vm_vaddr_alloc()/vm_vaddr_alloc_page()/vm_vaddr_alloc_pages() but these functions allocate and map physical pages too. Make it possible to request unmapped range too. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-36-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 228212ede05eb..c7685c7038ff0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -385,6 +385,7 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a3ff96ec02355..1d26a21601785 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1214,8 +1214,8 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) * TEST_ASSERT failure occurs for invalid input or no area of at least * sz unallocated bytes >= vaddr_min is available. */ -static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min) +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min) { uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; -- GitLab From 8f649b57856b855f8a28724321e7ae72e31d513a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Nov 2022 13:58:51 -0800 Subject: [PATCH 0801/1423] IB/hfi1: Replace 1-element array with singleton Zero-length arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace zero-length array with flexible-array member "lvs" in struct opa_port_data_counters_msg and struct opa_port_error_counters64_msg. Additionally, the "port" member of several structs is defined as a single-element, but is only ever accessed at index 0. Replace it with a singleton so that flexible array usage is sane. This results in no differences in binary output. [1] https://github.com/KSPP/linux/issues/78 Link: https://lore.kernel.org/r/20221118215847.never.416-kees@kernel.org Signed-off-by: Kees Cook Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mad.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 4146a2113a954..e5e783c45810b 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2437,9 +2437,9 @@ struct opa_port_data_counters_msg { __be64 port_vl_xmit_wait_data; __be64 port_vl_rcv_bubble; __be64 port_vl_mark_fecn; - } vls[0]; + } vls[]; /* array size defined by #bits set in vl_select_mask*/ - } port[1]; /* array size defined by #ports in attribute modifier */ + } port; }; struct opa_port_error_counters64_msg { @@ -2470,9 +2470,9 @@ struct opa_port_error_counters64_msg { u8 reserved3[7]; struct _vls_ectrs { __be64 port_vl_xmit_discards; - } vls[0]; + } vls[]; /* array size defined by #bits set in vl_select_mask */ - } port[1]; /* array size defined by #ports in attribute modifier */ + } port; }; struct opa_port_error_info_msg { @@ -2543,7 +2543,7 @@ struct opa_port_error_info_msg { u8 error_info; } __packed fm_config_ei; __u32 reserved9; - } port[1]; /* actual array size defined by #ports in attr modifier */ + } port; }; /* opa_port_error_info_msg error_info_select_mask bit definitions */ @@ -2966,7 +2966,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, } /* Sanity check */ - response_data_size = struct_size(req, port[0].vls, num_vls); + response_data_size = struct_size(req, port.vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; @@ -2986,7 +2986,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - rsp = &req->port[0]; + rsp = &req->port; memset(rsp, 0, sizeof(*rsp)); rsp->port_number = port; @@ -3182,7 +3182,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - response_data_size = struct_size(req, port[0].vls, num_vls); + response_data_size = struct_size(req, port.vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; @@ -3201,7 +3201,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - rsp = &req->port[0]; + rsp = &req->port; ibp = to_iport(ibdev, port_num); ppd = ppd_from_ibp(ibp); @@ -3340,7 +3340,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, u64 reg; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = &req->port[0]; + rsp = &req->port; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); @@ -3590,7 +3590,7 @@ static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, u32 error_info_select; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = &req->port[0]; + rsp = &req->port; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); -- GitLab From 8e1a76493be9868fef34f977296a69769dcdfa6f Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 11 Nov 2022 21:35:37 -0500 Subject: [PATCH 0802/1423] RDMA/rxe: Remove reliable datagram support The rdma_rxe driver does not actually support the reliable datagram transport but contains a variable with RD opcodes in driver code. And this variable is never used. So remove it. Link: https://lore.kernel.org/r/20221112023537.432912-1-yanjun.zhu@intel.com Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_hdr.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index e432f9e37795e..804594b76040a 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -742,7 +742,6 @@ enum aeth_syndrome { AETH_NAK_INVALID_REQ = 0x61, AETH_NAK_REM_ACC_ERR = 0x62, AETH_NAK_REM_OP_ERR = 0x63, - AETH_NAK_INV_RD_REQ = 0x64, }; static inline u8 __aeth_syn(void *arg) -- GitLab From 7d984dac8f6bf4ebd3398af82b357e1d181ecaac Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sun, 30 Oct 2022 03:04:33 +0000 Subject: [PATCH 0803/1423] RDMA/rxe: Fix mr->map double free rxe_mr_cleanup() which tries to free mr->map again will be called when rxe_mr_init_user() fails: CPU: 0 PID: 4917 Comm: rdma_flush_serv Kdump: loaded Not tainted 6.1.0-rc1-roce-flush+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x5d panic+0x19e/0x349 end_report.part.0+0x54/0x7c kasan_report.cold+0xa/0xf rxe_mr_cleanup+0x9d/0xf0 [rdma_rxe] __rxe_cleanup+0x10a/0x1e0 [rdma_rxe] rxe_reg_user_mr+0xb7/0xd0 [rdma_rxe] ib_uverbs_reg_mr+0x26a/0x480 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x1a2/0x250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1397/0x15a0 [ib_uverbs] This issue was firstly exposed since commit b18c7da63fcb ("RDMA/rxe: Fix memory leak in error path code") and then we fixed it in commit 8ff5f5d9d8cf ("RDMA/rxe: Prevent double freeing rxe_map_set()") but this fix was reverted together at last by commit 1e75550648da (Revert "RDMA/rxe: Create duplicate mapping tables for FMRs") Simply let rxe_mr_cleanup() always handle freeing the mr->map once it is successfully allocated. Fixes: 1e75550648da ("Revert "RDMA/rxe: Create duplicate mapping tables for FMRs"") Link: https://lore.kernel.org/r/1667099073-2-1-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index cd846cf82a84c..b1423000e4bcd 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -97,6 +97,7 @@ err2: kfree(mr->map[i]); kfree(mr->map); + mr->map = NULL; err1: return -ENOMEM; } @@ -120,7 +121,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int num_buf; void *vaddr; int err; - int i; umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { @@ -159,9 +159,8 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, if (!vaddr) { rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; - goto err_cleanup_map; + goto err_release_umem; } - buf->addr = (uintptr_t)vaddr; buf->size = PAGE_SIZE; num_buf++; @@ -178,10 +177,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, return 0; -err_cleanup_map: - for (i = 0; i < mr->num_map; i++) - kfree(mr->map[i]); - kfree(mr->map); err_release_umem: ib_umem_release(umem); err_out: -- GitLab From 8eaa6f7d569b4a22bfc1b0a3fdfeeb401feb65a4 Mon Sep 17 00:00:00 2001 From: Luoyouming Date: Tue, 8 Nov 2022 21:38:46 +0800 Subject: [PATCH 0804/1423] RDMA/hns: Fix ext_sge num error when post send In the HNS ROCE driver, The sge is divided into standard sge and extended sge. There are 2 standard sge in RC/XRC, and the UD standard sge is 0. In the scenario of RC SQ inline, if the data does not exceed 32bytes, the standard sge will be used. If it exceeds, only the extended sge will be used to fill the data. Currently, when filling the extended sge, max_gs is directly used as the number of the extended sge, which did not subtract the number of standard sge. There is a logical error. The new algorithm subtracts the number of standard sge from max_gs to get the actual number of extended sge. Fixes: 30b707886aeb ("RDMA/hns: Support inline data in extented sge space for RC") Link: https://lore.kernel.org/r/20221108133847.2304539-2-xuhaoyue1@hisilicon.com Signed-off-by: Luoyouming Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1ead35fb031b0..dcb59c05edfd8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,20 +188,29 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } +static unsigned int get_std_sge_num(struct hns_roce_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) + return 0; + + return HNS_ROCE_SGE_IN_WQE; +} + static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) { struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; - unsigned int ext_sge_sz = qp->sq.max_gs * HNS_ROCE_SGE_SIZE; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; + unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg; - if (msg_len > ext_sge_sz) { + std_sge_num = get_std_sge_num(qp); + if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; -- GitLab From 0c5e259b06a8efc69f929ad777ea49281bb58e37 Mon Sep 17 00:00:00 2001 From: Luoyouming Date: Tue, 8 Nov 2022 21:38:47 +0800 Subject: [PATCH 0805/1423] RDMA/hns: Fix incorrect sge nums calculation The user usually configures the number of sge through the max_send_sge parameter when creating qp, and configures the maximum size of inline data that can be sent through max_inline_data. Inline uses sge to fill data to send. Expect the following: 1) When the sge space cannot hold inline data, the sge space needs to be expanded to accommodate all inline data 2) When the sge space is enough to accommodate inline data, the upper limit of inline data can be increased so that users can send larger inline data Currently case one is not implemented. When the inline data is larger than the sge space, an error of insufficient sge space occurs. This part of the code needs to be reimplemented according to the expected rules. The calculation method of sge num is modified to take the maximum value of max_send_sge and the sge for max_inline_data to solve this problem. Fixes: 05201e01be93 ("RDMA/hns: Refactor process of setting extended sge") Fixes: 30b707886aeb ("RDMA/hns: Support inline data in extented sge space for RC") Link: https://lore.kernel.org/r/20221108133847.2304539-3-xuhaoyue1@hisilicon.com Signed-off-by: Luoyouming Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 18 +++- drivers/infiniband/hw/hns/hns_roce_qp.c | 107 ++++++++++++++++---- include/uapi/rdma/hns-abi.h | 15 +++ 5 files changed, 125 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 723e55a7de8d9..f701cc86896b3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -202,6 +202,7 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; + u32 config; }; struct hns_roce_pd { @@ -334,6 +335,7 @@ struct hns_roce_wq { u32 head; u32 tail; void __iomem *db_reg; + u32 ext_sge_cnt; }; struct hns_roce_sge { @@ -635,6 +637,7 @@ struct hns_roce_qp { struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ struct hns_user_mmap_entry *dwqe_mmap_entry; + u32 config; }; struct hns_roce_ib_iboe { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dcb59c05edfd8..939811867249f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,14 +188,6 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } -static unsigned int get_std_sge_num(struct hns_roce_qp *qp) -{ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return 0; - - return HNS_ROCE_SGE_IN_WQE; -} - static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) @@ -203,14 +195,12 @@ static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; - unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg; - std_sge_num = get_std_sge_num(qp); - if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { + if (msg_len > qp->sq.ext_sge_cnt * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index dcf89689a4c62..8ba68ac12388d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -354,10 +354,11 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - int ret; struct hns_roce_ucontext *context = to_hr_ucontext(uctx); - struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + struct hns_roce_ib_alloc_ucontext_resp resp = {}; + struct hns_roce_ib_alloc_ucontext ucmd = {}; + int ret; if (!hr_dev->active) return -EAGAIN; @@ -365,6 +366,19 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.qp_tab_size = hr_dev->caps.num_qps; resp.srq_tab_size = hr_dev->caps.num_srqs; + ret = ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd))); + if (ret) + return ret; + + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + context->config = ucmd.config & HNS_ROCE_EXSGE_FLAGS; + + if (context->config & HNS_ROCE_EXSGE_FLAGS) { + resp.config |= HNS_ROCE_RSP_EXSGE_FLAGS; + resp.max_inline_data = hr_dev->caps.max_sq_inline; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f0bd82a18069a..0ae335fb205ca 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -476,38 +476,109 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return 0; } -static u32 get_wqe_ext_sge_cnt(struct hns_roce_qp *qp) +static u32 get_max_inline_data(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap) { - /* GSI/UD QP only has extended sge */ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return qp->sq.max_gs; - - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) - return qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE; + if (cap->max_inline_data) { + cap->max_inline_data = roundup_pow_of_two(cap->max_inline_data); + return min(cap->max_inline_data, + hr_dev->caps.max_sq_inline); + } return 0; } +static void update_inline_data(struct hns_roce_qp *hr_qp, + struct ib_qp_cap *cap) +{ + u32 sge_num = hr_qp->sq.ext_sge_cnt; + + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + if (!(hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD)) + sge_num = max((u32)HNS_ROCE_SGE_IN_WQE, sge_num); + + cap->max_inline_data = max(cap->max_inline_data, + sge_num * HNS_ROCE_SGE_SIZE); + } + + hr_qp->max_inline_data = cap->max_inline_data; +} + +static u32 get_sge_num_from_max_send_sge(bool is_ud_or_gsi, + u32 max_send_sge) +{ + unsigned int std_sge_num; + unsigned int min_sge; + + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + min_sge = is_ud_or_gsi ? 1 : 0; + return max_send_sge > std_sge_num ? (max_send_sge - std_sge_num) : + min_sge; +} + +static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, + u32 max_inline_data) +{ + unsigned int inline_sge; + + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + + /* + * if max_inline_data less than + * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, + * In addition to ud's mode, no need to extend sge. + */ + if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) + inline_sge = 0; + + return inline_sge; +} + static void set_ext_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap) { + bool is_ud_or_gsi = (hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD); + unsigned int std_sge_num; + u32 inline_ext_sge = 0; + u32 ext_wqe_sge_cnt; u32 total_sge_cnt; - u32 wqe_sge_cnt; + + cap->max_inline_data = get_max_inline_data(hr_dev, cap); hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + ext_wqe_sge_cnt = get_sge_num_from_max_send_sge(is_ud_or_gsi, + cap->max_send_sge); - hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + inline_ext_sge = max(ext_wqe_sge_cnt, + get_sge_num_from_max_inl_data(is_ud_or_gsi, + cap->max_inline_data)); + hr_qp->sq.ext_sge_cnt = inline_ext_sge ? + roundup_pow_of_two(inline_ext_sge) : 0; - wqe_sge_cnt = get_wqe_ext_sge_cnt(hr_qp); + hr_qp->sq.max_gs = max(1U, (hr_qp->sq.ext_sge_cnt + std_sge_num)); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + + ext_wqe_sge_cnt = hr_qp->sq.ext_sge_cnt; + } else { + hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + hr_qp->sq.ext_sge_cnt = hr_qp->sq.max_gs; + } /* If the number of extended sge is not zero, they MUST use the * space of HNS_HW_PAGE_SIZE at least. */ - if (wqe_sge_cnt) { - total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * wqe_sge_cnt); + if (ext_wqe_sge_cnt) { + total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * ext_wqe_sge_cnt); hr_qp->sge.sge_cnt = max(total_sge_cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE); } + + update_inline_data(hr_qp, cap); } static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, @@ -556,6 +627,7 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; + cap->max_send_sge = hr_qp->sq.max_gs; return 0; } @@ -986,13 +1058,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ucontext *uctx; int ret; - if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline) - init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline; - - hr_qp->max_inline_data = init_attr->cap.max_inline_data; - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -1015,12 +1083,17 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; } + uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, + ibucontext); + hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); if (ret) ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); } else { + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + hr_qp->config = HNS_ROCE_EXSGE_FLAGS; ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index f6fde06db4b4e..745790ce3c261 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -85,11 +85,26 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; }; +enum { + HNS_ROCE_EXSGE_FLAGS = 1 << 0, +}; + +enum { + HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, +}; + struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; __u32 srq_tab_size; __u32 reserved; + __u32 config; + __u32 max_inline_data; +}; + +struct hns_roce_ib_alloc_ucontext { + __u32 config; + __u32 reserved; }; struct hns_roce_ib_alloc_pd_resp { -- GitLab From 26d2d0594d7016dbcbce4038aa202c2858d5a944 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:25 +0000 Subject: [PATCH 0806/1423] KVM: arm64: PMU: Do not let AArch32 change the counters' top 32 bits Even when using PMUv3p5 (which implies 64bit counters), there is no way for AArch32 to write to the top 32 bits of the counters. The only way to influence these bits (other than by counting events) is by writing PMCR.P==1. Make sure we obey the architecture and preserve the top 32 bits on a counter update. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-10-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index ea0c8411641fd..7a945fa6dd033 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -119,13 +119,8 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) return counter; } -/** - * kvm_pmu_set_counter_value - set PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - * @val: The counter value - */ -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +static void kvm_pmu_set_counter(struct kvm_vcpu *vcpu, u64 select_idx, u64 val, + bool force) { u64 reg; @@ -135,12 +130,36 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) kvm_pmu_release_perf_event(&vcpu->arch.pmu.pmc[select_idx]); reg = counter_index_to_reg(select_idx); + + if (vcpu_mode_is_32bit(vcpu) && select_idx != ARMV8_PMU_CYCLE_IDX && + !force) { + /* + * Even with PMUv3p5, AArch32 cannot write to the top + * 32bit of the counters. The only possible course of + * action is to use PMCR.P, which will reset them to + * 0 (the only use of the 'force' parameter). + */ + val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); + val |= lower_32_bits(val); + } + __vcpu_sys_reg(vcpu, reg) = val; /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(vcpu, select_idx); } +/** + * kvm_pmu_set_counter_value - set PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + * @val: The counter value + */ +void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) +{ + kvm_pmu_set_counter(vcpu, select_idx, val, false); +} + /** * kvm_pmu_release_perf_event - remove the perf event * @pmc: The PMU counter pointer @@ -533,7 +552,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) - kvm_pmu_set_counter_value(vcpu, i, 0); + kvm_pmu_set_counter(vcpu, i, 0, true); } } -- GitLab From 3d0dba5764b94308b8c4257ad64e383f11ce0c92 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:26 +0000 Subject: [PATCH 0807/1423] KVM: arm64: PMU: Move the ID_AA64DFR0_EL1.PMUver limit to VM creation As further patches will enable the selection of a PMU revision from userspace, sample the supported PMU revision at VM creation time, rather than building each time the ID_AA64DFR0_EL1 register is accessed. This shouldn't result in any change in behaviour. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-11-maz@kernel.org --- arch/arm64/include/asm/kvm_host.h | 4 ++++ arch/arm64/kvm/arm.c | 6 ++++++ arch/arm64/kvm/pmu-emul.c | 11 ++++++++++ arch/arm64/kvm/sys_regs.c | 36 ++++++++++++++++++++++++------- include/kvm/arm_pmu.h | 6 ++++++ 5 files changed, 55 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 45e2136322ba2..cc44e3bc528d8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -163,6 +163,10 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; + struct { + u8 imp:4; + u8 unimp:4; + } dfr0_pmuver; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10c..f956aab438c7a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -164,6 +164,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); + /* + * Initialise the default PMUver before there is a chance to + * create an actual PMU. + */ + kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(&kvm->arch.mmu); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 7a945fa6dd033..94ca2d17a4e46 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1047,3 +1047,14 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -ENXIO; } + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + u64 tmp; + + tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + tmp = cpuid_feature_cap_perfmon_field(tmp, + ID_AA64DFR0_EL1_PMUVer_SHIFT, + ID_AA64DFR0_EL1_PMUVer_V3P4); + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f4a7c5abcbca4..297b4fcbf9696 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1062,6 +1062,27 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_has_pmu(vcpu)) + return vcpu->kvm->arch.dfr0_pmuver.imp; + + return vcpu->kvm->arch.dfr0_pmuver.unimp; +} + +static u8 pmuver_to_perfmon(u8 pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return ID_DFR0_PERFMON_8_0; + case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: + return ID_DFR0_PERFMON_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return pmuver; + } +} + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { @@ -1111,18 +1132,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r /* Limit debug to ARMv8.0 */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_EL1_PMUVer_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_EL1_PMUVer_V3P4 : 0); + /* Set PMUver to the required version */ + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + vcpu_pmuver(vcpu)); /* Hide SPE from guests */ val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); break; case SYS_ID_DFR0_EL1: - /* Limit guests to PMUv3 for ARMv8.4 */ - val = cpuid_feature_cap_perfmon_field(val, - ID_DFR0_PERFMON_SHIFT, - kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_PERFMON_8_4 : 0); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), + pmuver_to_perfmon(vcpu_pmuver(vcpu))); break; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 96b192139a23a..812f729c91087 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -89,6 +89,8 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) +u8 kvm_arm_pmu_get_pmuver_limit(void); + #else struct kvm_pmu { }; @@ -154,6 +156,10 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} +static inline u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + return 0; +} #endif -- GitLab From 60e651ff1f48bfdf8fec80d35510bd89ecf8c766 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:27 +0000 Subject: [PATCH 0808/1423] KVM: arm64: PMU: Allow ID_AA64DFR0_EL1.PMUver to be set from userspace Allow userspace to write ID_AA64DFR0_EL1, on the condition that only the PMUver field can be altered and be at most the one that was initially computed for the guest. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-12-maz@kernel.org --- arch/arm64/kvm/sys_regs.c | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 297b4fcbf9696..49585258ae6c0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1242,6 +1242,45 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 pmuver, host_pmuver; + bool valid_pmu; + + host_pmuver = kvm_arm_pmu_get_pmuver_limit(); + + /* + * Allow AA64DFR0_EL1.PMUver to be set from userspace as long + * as it doesn't promise more than what the HW gives us. We + * allow an IMPDEF PMU though, only if no PMU is supported + * (KVM backward compatibility handling). + */ + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); + if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) + return -EINVAL; + + valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PMUver, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; + else + vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1503,7 +1542,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - ID_SANITISED(ID_AA64DFR0_EL1), + { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), -- GitLab From d82e0dfdfda73f91e7282e1083a2cd7cd366ea87 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:28 +0000 Subject: [PATCH 0809/1423] KVM: arm64: PMU: Allow ID_DFR0_EL1.PerfMon to be set from userspace Allow userspace to write ID_DFR0_EL1, on the condition that only the PerfMon field can be altered and be something that is compatible with what was computed for the AArch64 view of the guest. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-13-maz@kernel.org --- arch/arm64/kvm/sys_regs.c | 57 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 49585258ae6c0..b8ac58723459c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1070,6 +1070,19 @@ static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) return vcpu->kvm->arch.dfr0_pmuver.unimp; } +static u8 perfmon_to_pmuver(u8 perfmon) +{ + switch (perfmon) { + case ID_DFR0_PERFMON_8_0: + return ID_AA64DFR0_EL1_PMUVer_IMP; + case ID_DFR0_PERFMON_IMP_DEF: + return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; + default: + /* Anything ARMv8.1+ and NI have the same value. For now. */ + return perfmon; + } +} + static u8 pmuver_to_perfmon(u8 pmuver) { switch (pmuver) { @@ -1281,6 +1294,46 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + u8 perfmon, host_perfmon; + bool valid_pmu; + + host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + + /* + * Allow DFR0_EL1.PerfMon to be set from userspace as long as + * it doesn't promise more than what the HW gives us on the + * AArch64 side (as everything is emulated with that), and + * that this is a PMUv3. + */ + perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_PERFMON), val); + if ((perfmon != ID_DFR0_PERFMON_IMP_DEF && perfmon > host_perfmon) || + (perfmon != 0 && perfmon < ID_DFR0_PERFMON_8_0)) + return -EINVAL; + + valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_PERFMON_IMP_DEF); + + /* Make sure view register and PMU support do match */ + if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + return -EINVAL; + + /* We can only differ with PerfMon, and anything else is an error */ + val ^= read_id_reg(vcpu, rd); + val &= ~ARM64_FEATURE_MASK(ID_DFR0_PERFMON); + if (val) + return -EINVAL; + + if (valid_pmu) + vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); + else + vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); + + return 0; +} + /* * cpufeature ID register user accessors * @@ -1502,7 +1555,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - AA32_ID_SANITISED(ID_DFR0_EL1), + { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, + .get_user = get_id_reg, .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), -- GitLab From 11af4c37165e36a6090172ded5d06acdf15206da Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:29 +0000 Subject: [PATCH 0810/1423] KVM: arm64: PMU: Implement PMUv3p5 long counter support PMUv3p5 (which is mandatory with ARMv8.5) comes with some extra features: - All counters are 64bit - The overflow point is controlled by the PMCR_EL0.LP bit Add the required checks in the helpers that control counter width and overflow, as well as the sysreg handling for the LP bit. A new kvm_pmu_is_3p5() helper makes it easy to spot the PMUv3p5 specific handling. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-14-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 8 +++++--- arch/arm64/kvm/sys_regs.c | 4 ++++ include/kvm/arm_pmu.h | 7 +++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 94ca2d17a4e46..7e25ff73cbba8 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -52,13 +52,15 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm) */ static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) { - return (select_idx == ARMV8_PMU_CYCLE_IDX); + return (select_idx == ARMV8_PMU_CYCLE_IDX || kvm_pmu_is_3p5(vcpu)); } static bool kvm_pmu_idx_has_64bit_overflow(struct kvm_vcpu *vcpu, u64 select_idx) { - return (select_idx == ARMV8_PMU_CYCLE_IDX && - __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC); + u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0); + + return (select_idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || + (select_idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); } static bool kvm_pmu_counter_can_chain(struct kvm_vcpu *vcpu, u64 idx) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b8ac58723459c..67eac0f747be8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -654,6 +654,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; + if (!kvm_pmu_is_3p5(vcpu)) + val &= ~ARMV8_PMU_PMCR_LP; __vcpu_sys_reg(vcpu, r->reg) = val; } @@ -703,6 +705,8 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; + if (!kvm_pmu_is_3p5(vcpu)) + val &= ~ARMV8_PMU_PMCR_LP; __vcpu_sys_reg(vcpu, PMCR_EL0) = val; kvm_pmu_handle_pmcr(vcpu, val); kvm_vcpu_pmu_restore_guest(vcpu); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 812f729c91087..628775334d5ef 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -89,6 +89,12 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) +/* + * Evaluates as true when emulating PMUv3p5, and false otherwise. + */ +#define kvm_pmu_is_3p5(vcpu) \ + (vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5) + u8 kvm_arm_pmu_get_pmuver_limit(void); #else @@ -153,6 +159,7 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) } #define kvm_vcpu_has_pmu(vcpu) ({ false; }) +#define kvm_pmu_is_3p5(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} -- GitLab From 1f7c978282855d6b2abd608064004c74902e791d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:30 +0000 Subject: [PATCH 0811/1423] KVM: arm64: PMU: Allow PMUv3p5 to be exposed to the guest Now that the infrastructure is in place, bump the PMU support up to PMUv3p5. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-15-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 7e25ff73cbba8..be881ae671336 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -1057,6 +1057,6 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) tmp = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); tmp = cpuid_feature_cap_perfmon_field(tmp, ID_AA64DFR0_EL1_PMUVer_SHIFT, - ID_AA64DFR0_EL1_PMUVer_V3P4); + ID_AA64DFR0_EL1_PMUVer_V3P5); return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); } -- GitLab From 9bad925dd741408825590eccc495d073cc246de0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:31 +0000 Subject: [PATCH 0812/1423] KVM: arm64: PMU: Simplify vcpu computation on perf overflow notification The way we compute the target vcpu on getting an overflow is a bit odd, as we use the PMC array as an anchor for kvm_pmc_to_vcpu, while we could directly compute the correct address. Get rid of the intermediate step and directly compute the target vcpu. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-16-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index be881ae671336..49a0046604979 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -405,11 +405,8 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) { struct kvm_vcpu *vcpu; - struct kvm_pmu *pmu; - - pmu = container_of(work, struct kvm_pmu, overflow_work); - vcpu = kvm_pmc_to_vcpu(pmu->pmc); + vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); kvm_vcpu_kick(vcpu); } -- GitLab From d56bdce586e7fabd2b3339f476e0e4c059b24e19 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Nov 2022 16:38:32 +0000 Subject: [PATCH 0813/1423] KVM: arm64: PMU: Make kvm_pmc the main data structure The PMU code has historically been torn between referencing a counter as a pair vcpu+index or as the PMC pointer. Given that it is pretty easy to go from one representation to the other, standardise on the latter which, IMHO, makes the code slightly more readable. YMMV. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221113163832.3154370-17-maz@kernel.org --- arch/arm64/kvm/pmu-emul.c | 174 +++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 49a0046604979..3295dea34f4c9 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -22,9 +22,19 @@ DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx); +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); +static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) +{ + return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); +} + +static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) +{ + return &vcpu->arch.pmu.pmc[cnt_idx]; +} + static u32 kvm_pmu_event_mask(struct kvm *kvm) { unsigned int pmuver; @@ -46,38 +56,27 @@ static u32 kvm_pmu_event_mask(struct kvm *kvm) } /** - * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter - * @vcpu: The vcpu pointer - * @select_idx: The counter index + * kvm_pmc_is_64bit - determine if counter is 64bit + * @pmc: counter context */ -static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx) +static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { - return (select_idx == ARMV8_PMU_CYCLE_IDX || kvm_pmu_is_3p5(vcpu)); + return (pmc->idx == ARMV8_PMU_CYCLE_IDX || + kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc))); } -static bool kvm_pmu_idx_has_64bit_overflow(struct kvm_vcpu *vcpu, u64 select_idx) +static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0); - return (select_idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || - (select_idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); + return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || + (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); } -static bool kvm_pmu_counter_can_chain(struct kvm_vcpu *vcpu, u64 idx) +static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc) { - return (!(idx & 1) && (idx + 1) < ARMV8_PMU_CYCLE_IDX && - !kvm_pmu_idx_has_64bit_overflow(vcpu, idx)); -} - -static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu; - struct kvm_vcpu_arch *vcpu_arch; - - pmc -= pmc->idx; - pmu = container_of(pmc, struct kvm_pmu, pmc[0]); - vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu); - return container_of(vcpu_arch, struct kvm_vcpu, arch); + return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX && + !kvm_pmc_has_64bit_overflow(pmc)); } static u32 counter_index_to_reg(u64 idx) @@ -90,21 +89,12 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } -/** - * kvm_pmu_get_counter_value - get PMU counter value - * @vcpu: The vcpu pointer - * @select_idx: The counter index - */ -u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) +static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 counter, reg, enabled, running; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; - - if (!kvm_vcpu_has_pmu(vcpu)) - return 0; - reg = counter_index_to_reg(select_idx); + reg = counter_index_to_reg(pmc->idx); counter = __vcpu_sys_reg(vcpu, reg); /* @@ -115,25 +105,35 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); - if (!kvm_pmu_idx_is_64bit(vcpu, select_idx)) + if (!kvm_pmc_is_64bit(pmc)) counter = lower_32_bits(counter); return counter; } -static void kvm_pmu_set_counter(struct kvm_vcpu *vcpu, u64 select_idx, u64 val, - bool force) +/** + * kvm_pmu_get_counter_value - get PMU counter value + * @vcpu: The vcpu pointer + * @select_idx: The counter index + */ +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { - u64 reg; - if (!kvm_vcpu_has_pmu(vcpu)) - return; + return 0; - kvm_pmu_release_perf_event(&vcpu->arch.pmu.pmc[select_idx]); + return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); +} - reg = counter_index_to_reg(select_idx); +static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 reg; - if (vcpu_mode_is_32bit(vcpu) && select_idx != ARMV8_PMU_CYCLE_IDX && + kvm_pmu_release_perf_event(pmc); + + reg = counter_index_to_reg(pmc->idx); + + if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX && !force) { /* * Even with PMUv3p5, AArch32 cannot write to the top @@ -148,7 +148,7 @@ static void kvm_pmu_set_counter(struct kvm_vcpu *vcpu, u64 select_idx, u64 val, __vcpu_sys_reg(vcpu, reg) = val; /* Recreate the perf event to reflect the updated sample_period */ - kvm_pmu_create_perf_event(vcpu, select_idx); + kvm_pmu_create_perf_event(pmc); } /** @@ -159,7 +159,10 @@ static void kvm_pmu_set_counter(struct kvm_vcpu *vcpu, u64 select_idx, u64 val, */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { - kvm_pmu_set_counter(vcpu, select_idx, val, false); + if (!kvm_vcpu_has_pmu(vcpu)) + return; + + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** @@ -181,14 +184,15 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) * * If this counter has been configured to monitor some event, release it here. */ -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) +static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 reg, val; if (!pmc->perf_event) return; - val = kvm_pmu_get_counter_value(vcpu, pmc->idx); + val = kvm_pmu_get_pmc_value(pmc); reg = counter_index_to_reg(pmc->idx); @@ -219,11 +223,10 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); - struct kvm_pmu *pmu = &vcpu->arch.pmu; int i; for_each_set_bit(i, &mask, 32) - kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); + kvm_pmu_stop_counter(kvm_vcpu_idx_to_pmc(vcpu, i)); } /** @@ -234,10 +237,9 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) - kvm_pmu_release_perf_event(&pmu->pmc[i]); + kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } @@ -262,9 +264,6 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - if (!kvm_vcpu_has_pmu(vcpu)) return; @@ -272,13 +271,15 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc; + if (!(val & BIT(i))) continue; - pmc = &pmu->pmc[i]; + pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!pmc->perf_event) { - kvm_pmu_create_perf_event(vcpu, i); + kvm_pmu_create_perf_event(pmc); } else { perf_event_enable(pmc->perf_event); if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) @@ -297,17 +298,17 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; if (!kvm_vcpu_has_pmu(vcpu) || !val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + struct kvm_pmc *pmc; + if (!(val & BIT(i))) continue; - pmc = &pmu->pmc[i]; + pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (pmc->perf_event) perf_event_disable(pmc->perf_event); @@ -427,6 +428,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); u64 type, reg; /* Filter on event type */ @@ -437,30 +439,30 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, /* Increment this counter */ reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; - if (!kvm_pmu_idx_is_64bit(vcpu, i)) + if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; /* No overflow? move on */ - if (kvm_pmu_idx_has_64bit_overflow(vcpu, i) ? reg : lower_32_bits(reg)) + if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) continue; /* Mark overflow */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); - if (kvm_pmu_counter_can_chain(vcpu, i)) + if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), ARMV8_PMUV3_PERFCTR_CHAIN); } } /* Compute the sample period for a given counter value */ -static u64 compute_period(struct kvm_vcpu *vcpu, u64 select_idx, u64 counter) +static u64 compute_period(struct kvm_pmc *pmc, u64 counter) { u64 val; - if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) { - if (!kvm_pmu_idx_has_64bit_overflow(vcpu, select_idx)) + if (kvm_pmc_is_64bit(pmc)) { + if (!kvm_pmc_has_64bit_overflow(pmc)) val = -(counter & GENMASK(31, 0)); else val = (-counter) & GENMASK(63, 0); @@ -490,7 +492,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, * Reset the sample period to the architectural limit, * i.e. the point where the counter overflows. */ - period = compute_period(vcpu, idx, local64_read(&perf_event->count)); + period = compute_period(pmc, local64_read(&perf_event->count)); local64_set(&perf_event->hw.period_left, 0); perf_event->attr.sample_period = period; @@ -498,7 +500,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); - if (kvm_pmu_counter_can_chain(vcpu, idx)) + if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), ARMV8_PMUV3_PERFCTR_CHAIN); @@ -551,34 +553,33 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) - kvm_pmu_set_counter(vcpu, i, 0, true); + kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } } -static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) +static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); + (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); } /** * kvm_pmu_create_perf_event - create a perf event for a counter - * @vcpu: The vcpu pointer - * @select_idx: The number of selected counter + * @pmc: Counter context */ -static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) +static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc = &pmu->pmc[select_idx]; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, counter, reg, data; + u64 eventsel, reg, data; - reg = counter_index_to_evtreg(select_idx); + reg = counter_index_to_evtreg(pmc->idx); data = __vcpu_sys_reg(vcpu, reg); - kvm_pmu_stop_counter(vcpu, pmc); + kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else @@ -604,24 +605,22 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; - attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx); + attr.disabled = !kvm_pmu_counter_is_enabled(pmc); attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0; attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; - counter = kvm_pmu_get_counter_value(vcpu, select_idx); - /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. */ - if (kvm_pmu_idx_is_64bit(vcpu, select_idx)) + if (kvm_pmc_is_64bit(pmc)) attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; - attr.sample_period = compute_period(vcpu, select_idx, counter); + attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc)); event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); @@ -648,6 +647,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg, mask; if (!kvm_vcpu_has_pmu(vcpu)) @@ -657,11 +657,11 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, mask &= ~ARMV8_PMU_EVTYPE_EVENT; mask |= kvm_pmu_event_mask(vcpu->kvm); - reg = counter_index_to_evtreg(select_idx); + reg = counter_index_to_evtreg(pmc->idx); __vcpu_sys_reg(vcpu, reg) = data & mask; - kvm_pmu_create_perf_event(vcpu, select_idx); + kvm_pmu_create_perf_event(pmc); } void kvm_host_pmu_init(struct arm_pmu *pmu) -- GitLab From 9e7726a8a08a65ed48e2749ef62ec4970bdf851f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:15 +0100 Subject: [PATCH 0814/1423] KVM: selftests: Hyper-V PV TLB flush selftest Introduce a selftest for Hyper-V PV TLB flush hypercalls (HvFlushVirtualAddressSpace/HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressList/HvFlushVirtualAddressListEx). The test creates one 'sender' vCPU and two 'worker' vCPU which do busy loop reading from a certain GVA checking the observed value. Sender vCPU swaos the data page with another page filled with a different value. The expectation for workers is also altered. Without TLB flush on worker vCPUs, they may continue to observe old value. To guard against accidental TLB flushes for worker vCPUs the test is repeated 100 times. Hyper-V TLB flush hypercalls are tested in both 'normal' and 'XMM fast' modes. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-38-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/hyperv.h | 1 + .../selftests/kvm/x86_64/hyperv_tlb_flush.c | 690 ++++++++++++++++++ 4 files changed, 693 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 3b3218cb46ed6..dc7e28cf2da07 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -27,6 +27,7 @@ /x86_64/hyperv_features /x86_64/hyperv_ipi /x86_64/hyperv_svm_test +/x86_64/hyperv_tlb_flush /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/monitor_mwait_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4095b1212f08c..058b15213c5dd 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index c757e40011731..ae945f7408352 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -187,6 +187,7 @@ /* hypercall options */ #define HV_HYPERCALL_FAST_BIT BIT(16) #define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 /* * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status' diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c new file mode 100644 index 0000000000000..68f97ff720a74 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define WORKER_VCPU_ID_1 2 +#define WORKER_VCPU_ID_2 65 + +#define NTRY 100 +#define NTEST_PAGES 2 + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_tlb_flush { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +/* + * Pass the following info to 'workers' and 'sender' + * - Hypercall page's GVA + * - Hypercall page's GPA + * - Test pages GVA + * - GVAs of the test pages' PTEs + */ +struct test_data { + vm_vaddr_t hcall_gva; + vm_paddr_t hcall_gpa; + vm_vaddr_t test_pages; + vm_vaddr_t test_pages_pte[NTEST_PAGES]; +}; + +/* 'Worker' vCPU code checking the contents of the test page */ +static void worker_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES; + u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64)); + u64 expected, val; + + x2apic_enable(); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + + for (;;) { + cpu_relax(); + + expected = READ_ONCE(*this_cpu); + + /* + * Make sure the value in the test page is read after reading + * the expectation for the first time. Pairs with wmb() in + * prepare_to_test(). + */ + rmb(); + + val = READ_ONCE(*(u64 *)data->test_pages); + + /* + * Make sure the value in the test page is read after before + * reading the expectation for the second time. Pairs with wmb() + * post_test(). + */ + rmb(); + + /* + * '0' indicates the sender is between iterations, wait until + * the sender is ready for this vCPU to start checking again. + */ + if (!expected) + continue; + + /* + * Re-read the per-vCPU byte to ensure the sender didn't move + * onto a new iteration. + */ + if (expected != READ_ONCE(*this_cpu)) + continue; + + GUEST_ASSERT(val == expected); + } +} + +/* + * Write per-CPU info indicating what each 'worker' CPU is supposed to see in + * test page. '0' means don't check. + */ +static void set_expected_val(void *addr, u64 val, int vcpu_id) +{ + void *exp_page = addr + PAGE_SIZE * NTEST_PAGES; + + *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val; +} + +/* + * Update PTEs swapping two test pages. + * TODO: use swap()/xchg() when these are provided. + */ +static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2) +{ + uint64_t tmp = *(uint64_t *)pte_gva1; + + *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2; + *(uint64_t *)pte_gva2 = tmp; +} + +/* + * TODO: replace the silly NOP loop with a proper udelay() implementation. + */ +static inline void do_delay(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +/* + * Prepare to test: 'disable' workers by setting the expectation to '0', + * clear hypercall input page and then swap two test pages. + */ +static inline void prepare_to_test(struct test_data *data) +{ + /* Clear hypercall input page */ + memset((void *)data->hcall_gva, 0, PAGE_SIZE); + + /* 'Disable' workers */ + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2); + + /* Make sure workers are 'disabled' before we swap PTEs. */ + wmb(); + + /* Make sure workers have enough time to notice */ + do_delay(); + + /* Swap test page mappings */ + swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]); +} + +/* + * Finalize the test: check hypercall resule set the expected val for + * 'worker' CPUs and give them some time to test. + */ +static inline void post_test(struct test_data *data, u64 exp1, u64 exp2) +{ + /* Make sure we change the expectation after swapping PTEs */ + wmb(); + + /* Set the expectation for workers, '0' means don't test */ + set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2); + + /* Make sure workers have enough time to test */ + do_delay(); +} + +#define TESTVAL1 0x0101010101010101 +#define TESTVAL2 0x0202020202020202 + +/* Main vCPU doing the test */ +static void sender_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva; + struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva; + vm_paddr_t hcall_gpa = data->hcall_gpa; + int i, stage = 1; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa); + + /* "Slow" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->processor_mask = 0; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + /* "Fast" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : + TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT, + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + struct ucall uc; + int old; + int r; + unsigned int exit_reason; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + exit_reason = vcpu->run->exit_reason; + + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", + vcpu->id, exit_reason, exit_reason_str(exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + unsigned int exit_reason; + pthread_t threads[2]; + vm_vaddr_t test_data_page, gva; + vm_paddr_t gpa; + uint64_t *pte; + struct test_data *data; + struct ucall uc; + int stage = 1, r, i; + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Test data page */ + test_data_page = vm_vaddr_alloc_page(vm); + data = (struct test_data *)addr_gva2hva(vm, test_data_page); + + /* Hypercall input/output */ + data->hcall_gva = vm_vaddr_alloc_pages(vm, 2); + data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva); + memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE); + + /* + * Test pages: the first one is filled with '0x01's, the second with '0x02's + * and the test will swap their mappings. The third page keeps the indication + * about the current state of mappings. + */ + data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1); + for (i = 0; i < NTEST_PAGES; i++) + memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i), + (u8)(i + 1), PAGE_SIZE); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2); + + /* + * Get PTE pointers for test pages and map them inside the guest. + * Use separate page for each PTE for simplicity. + */ + gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); + for (i = 0; i < NTEST_PAGES; i++) { + pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + gpa = addr_hva2gpa(vm, pte); + __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); + } + + /* + * Sender vCPU which performs the test: swaps test pages, sets expectation + * for 'workers' and issues TLB flush hypercalls. + */ + vcpu_args_set(vcpu[0], 1, test_data_page); + vcpu_set_hv_cpuid(vcpu[0]); + + /* Create worker vCPUs which check the contents of the test pages */ + vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code); + vcpu_args_set(vcpu[1], 1, test_data_page); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code); + vcpu_args_set(vcpu[2], 1, test_data_page); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create() failed"); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create() failed"); + + while (true) { + vcpu_run(vcpu[0]); + exit_reason = vcpu[0]->run->exit_reason; + + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + exit_reason, exit_reason_str(exit_reason)); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)\n", + uc.args[1], stage); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return 0; +} -- GitLab From 1ad51c0c0cdd5315405d1c93a345635451c245bb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:16 +0100 Subject: [PATCH 0815/1423] KVM: selftests: Sync 'struct hv_enlightened_vmcs' definition with hyperv-tlfs.h 'struct hv_enlightened_vmcs' definition in selftests is not '__packed' and so we rely on the compiler doing the right padding. This is not obvious so it seems beneficial to use the same definition as in kernel. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-39-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/evmcs.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 58db74f68af25..4b6840df29793 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -41,6 +41,8 @@ struct hv_enlightened_vmcs { u16 host_gs_selector; u16 host_tr_selector; + u16 padding16_1; + u64 host_ia32_pat; u64 host_ia32_efer; @@ -159,7 +161,7 @@ struct hv_enlightened_vmcs { u64 ept_pointer; u16 virtual_processor_id; - u16 padding16[3]; + u16 padding16_2[3]; u64 padding64_2[5]; u64 guest_physical_address; @@ -195,13 +197,13 @@ struct hv_enlightened_vmcs { u64 guest_rip; u32 hv_clean_fields; - u32 hv_padding_32; + u32 padding32_1; u32 hv_synthetic_controls; struct { u32 nested_flush_hypercall:1; u32 msr_bitmap:1; u32 reserved:30; - } hv_enlightenments_control; + } __packed hv_enlightenments_control; u32 hv_vp_id; u32 padding32_2; u64 hv_vm_id; @@ -222,7 +224,7 @@ struct hv_enlightened_vmcs { u64 host_ssp; u64 host_ia32_int_ssp_table_addr; u64 padding64_6; -}; +} __packed; #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) -- GitLab From d7b14a868ac2122459b2d702fe05c75c48a687bf Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:17 +0100 Subject: [PATCH 0816/1423] KVM: selftests: Sync 'struct hv_vp_assist_page' definition with hyperv-tlfs.h 'struct hv_vp_assist_page' definition doesn't match TLFS. Also, define 'struct hv_nested_enlightenments_control' and use it instead of opaque '__u64'. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-40-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/evmcs.h | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 4b6840df29793..efdc62704f276 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -20,14 +20,26 @@ extern bool enable_evmcs; +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + +/* Define virtual processor assist page structure. */ struct hv_vp_assist_page { __u32 apic_assist; - __u32 reserved; - __u64 vtl_control[2]; - __u64 nested_enlightenments_control[2]; - __u32 enlighten_vmentry; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; __u64 current_nested_vmcs; -}; +} __packed; struct hv_enlightened_vmcs { u32 revision_id; -- GitLab From e8f3d23c02d09210a980ef211b3e8a99d44cb602 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:18 +0100 Subject: [PATCH 0817/1423] KVM: selftests: Move Hyper-V VP assist page enablement out of evmcs.h Hyper-V VP assist page is not eVMCS specific, it is also used for enlightened nSVM. Move the code to vendor neutral place. Reviewed-by: Maxim Levitsky Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-41-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/evmcs.h | 40 +------------------ .../selftests/kvm/include/x86_64/hyperv.h | 31 ++++++++++++++ .../testing/selftests/kvm/lib/x86_64/hyperv.c | 21 ++++++++++ .../testing/selftests/kvm/x86_64/evmcs_test.c | 1 + 5 files changed, 56 insertions(+), 38 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/x86_64/hyperv.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 058b15213c5dd..246d52e9df60d 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -53,6 +53,7 @@ LIBKVM_STRING += lib/string_override.c LIBKVM_x86_64 += lib/x86_64/apic.c LIBKVM_x86_64 += lib/x86_64/handlers.S +LIBKVM_x86_64 += lib/x86_64/hyperv.c LIBKVM_x86_64 += lib/x86_64/memstress.c LIBKVM_x86_64 += lib/x86_64/processor.c LIBKVM_x86_64 += lib/x86_64/svm.c diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index efdc62704f276..2530b5aeb4bae 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -10,6 +10,7 @@ #define SELFTEST_KVM_EVMCS_H #include +#include "hyperv.h" #include "vmx.h" #define u16 uint16_t @@ -20,27 +21,6 @@ extern bool enable_evmcs; -struct hv_nested_enlightenments_control { - struct { - __u32 directhypercall:1; - __u32 reserved:31; - } features; - struct { - __u32 reserved; - } hypercallControls; -} __packed; - -/* Define virtual processor assist page structure. */ -struct hv_vp_assist_page { - __u32 apic_assist; - __u32 reserved1; - __u64 vtl_control[3]; - struct hv_nested_enlightenments_control nested_control; - __u8 enlighten_vmentry; - __u8 reserved2[7]; - __u64 current_nested_vmcs; -} __packed; - struct hv_enlightened_vmcs { u32 revision_id; u32 abort; @@ -257,29 +237,13 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 -#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) - extern struct hv_enlightened_vmcs *current_evmcs; -extern struct hv_vp_assist_page *current_vp_assist; int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); -static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +static inline void evmcs_enable(void) { - u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | - HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; - - wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); - - current_vp_assist = vp_assist; - enable_evmcs = true; - - return 0; } static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index ae945f7408352..ba38fa347cba3 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -237,4 +237,35 @@ static inline void hyperv_write_xmm_input(void *data, int n_sse_regs) /* Proper HV_X64_MSR_GUEST_OS_ID value */ #define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + +/* Define virtual processor assist page structure. */ +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; + __u64 current_nested_vmcs; +} __packed; + +extern struct hv_vp_assist_page *current_vp_assist; + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c new file mode 100644 index 0000000000000..32dc0afd9e5bf --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Hyper-V specific functions. + * + * Copyright (C) 2021, Red Hat Inc. + */ +#include +#include "processor.h" +#include "hyperv.h" + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +{ + uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); + + current_vp_assist = vp_assist; + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 99bc202243d23..9007fb04343bc 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -79,6 +79,7 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(2); enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); + evmcs_enable(); GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); -- GitLab From cd8f11bd6bbd00565cf39c6335cf2788795fdca7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:19 +0100 Subject: [PATCH 0818/1423] KVM: selftests: Split off load_evmcs() from load_vmcs() In preparation to putting Hyper-V specific test pages to a dedicated struct, move eVMCS load logic from load_vmcs(). Tests call load_vmcs() directly and the only one which needs 'enlightened' version is evmcs_test so there's not much gain in having this merged. Temporary pass both GPA and HVA to load_evmcs(). Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-42-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/evmcs.h | 10 ++++++ tools/testing/selftests/kvm/lib/x86_64/vmx.c | 32 +++++++------------ .../testing/selftests/kvm/x86_64/evmcs_test.c | 4 +-- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 2530b5aeb4bae..59b60d45b8f6a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -256,6 +256,16 @@ static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) return 0; } +static inline bool load_evmcs(uint64_t enlightened_vmcs_gpa, void *enlightened_vmcs) +{ + if (evmcs_vmptrld(enlightened_vmcs_gpa, enlightened_vmcs)) + return false; + + current_evmcs->revision_id = EVMCS_VERSION; + + return true; +} + static inline int evmcs_vmptrst(uint64_t *value) { *value = current_vp_assist->current_nested_vmcs & diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 3e4ea846366cb..318ee4658f0b9 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -171,26 +171,18 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx) bool load_vmcs(struct vmx_pages *vmx) { - if (!enable_evmcs) { - /* Load a VMCS. */ - *(uint32_t *)(vmx->vmcs) = vmcs_revision(); - if (vmclear(vmx->vmcs_gpa)) - return false; - - if (vmptrld(vmx->vmcs_gpa)) - return false; - - /* Setup shadow VMCS, do not load it yet. */ - *(uint32_t *)(vmx->shadow_vmcs) = - vmcs_revision() | 0x80000000ul; - if (vmclear(vmx->shadow_vmcs_gpa)) - return false; - } else { - if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, - vmx->enlightened_vmcs)) - return false; - current_evmcs->revision_id = EVMCS_VERSION; - } + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; return true; } diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 9007fb04343bc..5a4c8b1873aab 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -81,10 +81,10 @@ void guest_code(struct vmx_pages *vmx_pages) enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); evmcs_enable(); - GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_SYNC(3); - GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(load_evmcs(vmx_pages->enlightened_vmcs_gpa, + vmx_pages->enlightened_vmcs)); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); GUEST_SYNC(4); -- GitLab From 2dc458b8622182ac4d89de793c548cf04f632801 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:20 +0100 Subject: [PATCH 0819/1423] KVM: selftests: Create a vendor independent helper to allocate Hyper-V specific test pages There's no need to pollute VMX and SVM code with Hyper-V specific stuff and allocate Hyper-V specific test pages for all test as only few really need them. Create a dedicated struct and an allocation helper. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-43-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/evmcs.h | 4 ++-- .../selftests/kvm/include/x86_64/hyperv.h | 15 +++++++++++++ .../selftests/kvm/include/x86_64/vmx.h | 8 ------- .../testing/selftests/kvm/lib/x86_64/hyperv.c | 20 +++++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/vmx.c | 12 ---------- .../testing/selftests/kvm/x86_64/evmcs_test.c | 22 +++++++++---------- 6 files changed, 48 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 59b60d45b8f6a..94d6059e9a12d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -256,9 +256,9 @@ static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) return 0; } -static inline bool load_evmcs(uint64_t enlightened_vmcs_gpa, void *enlightened_vmcs) +static inline bool load_evmcs(struct hyperv_test_pages *hv) { - if (evmcs_vmptrld(enlightened_vmcs_gpa, enlightened_vmcs)) + if (evmcs_vmptrld(hv->enlightened_vmcs_gpa, hv->enlightened_vmcs)) return false; current_evmcs->revision_id = EVMCS_VERSION; diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index ba38fa347cba3..becdd8245e846 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -268,4 +268,19 @@ extern struct hv_vp_assist_page *current_vp_assist; int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist); +struct hyperv_test_pages { + /* VP assist page */ + void *vp_assist_hva; + uint64_t vp_assist_gpa; + void *vp_assist; + + /* Enlightened VMCS */ + void *enlightened_vmcs_hva; + uint64_t enlightened_vmcs_gpa; + void *enlightened_vmcs; +}; + +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index e9c96b49966a6..ef784bd6dfc2d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -517,14 +517,6 @@ struct vmx_pages { uint64_t vmwrite_gpa; void *vmwrite; - void *vp_assist_hva; - uint64_t vp_assist_gpa; - void *vp_assist; - - void *enlightened_vmcs_hva; - uint64_t enlightened_vmcs_gpa; - void *enlightened_vmcs; - void *eptp_hva; uint64_t eptp_gpa; void *eptp; diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index 32dc0afd9e5bf..a2fc083c65efe 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -8,6 +8,26 @@ #include "processor.h" #include "hyperv.h" +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva) +{ + vm_vaddr_t hv_pages_gva = vm_vaddr_alloc_page(vm); + struct hyperv_test_pages *hv = addr_gva2hva(vm, hv_pages_gva); + + /* Setup of a region of guest memory for the VP Assist page. */ + hv->vp_assist = (void *)vm_vaddr_alloc_page(vm); + hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist); + hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist); + + /* Setup of a region of guest memory for the enlightened VMCS. */ + hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); + hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs); + hv->enlightened_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)hv->enlightened_vmcs); + + *p_hv_pages_gva = hv_pages_gva; + return hv; +} + int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) { uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 318ee4658f0b9..59d97531c9b17 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -109,18 +109,6 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); - /* Setup of a region of guest memory for the VP Assist page. */ - vmx->vp_assist = (void *)vm_vaddr_alloc_page(vm); - vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); - vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); - - /* Setup of a region of guest memory for the enlightened VMCS. */ - vmx->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); - vmx->enlightened_vmcs_hva = - addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); - vmx->enlightened_vmcs_gpa = - addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs); - *p_vmx_gva = vmx_gva; return vmx; } diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 5a4c8b1873aab..74f076ba574b3 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -68,7 +68,7 @@ void l2_guest_code(void) vmcall(); } -void guest_code(struct vmx_pages *vmx_pages) +void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; @@ -78,23 +78,22 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(1); GUEST_SYNC(2); - enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); evmcs_enable(); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_SYNC(3); - GUEST_ASSERT(load_evmcs(vmx_pages->enlightened_vmcs_gpa, - vmx_pages->enlightened_vmcs)); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_ASSERT(load_evmcs(hv_pages)); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); GUEST_SYNC(4); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(5); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); current_evmcs->revision_id = -1u; GUEST_ASSERT(vmlaunch()); current_evmcs->revision_id = EVMCS_VERSION; @@ -104,7 +103,7 @@ void guest_code(struct vmx_pages *vmx_pages) PIN_BASED_NMI_EXITING); GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); /* * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is @@ -152,7 +151,7 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(11); /* Try enlightened vmptrld with an incorrect GPA */ - evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); + evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs); GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(ud_count == 1); GUEST_DONE(); @@ -199,7 +198,7 @@ static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva = 0; + vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0; struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -217,7 +216,8 @@ int main(int argc, char *argv[]) vcpu_enable_evmcs(vcpu); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + vcpu_args_set(vcpu, 2, vmx_pages_gva, hv_pages_gva); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); -- GitLab From 6c15c3c46520374e1a144942e5228f963f5eb2d5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:21 +0100 Subject: [PATCH 0820/1423] KVM: selftests: Allocate Hyper-V partition assist page In preparation to testing Hyper-V L2 TLB flush hypercalls, allocate so-called Partition assist page. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-44-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/hyperv.h | 5 +++++ tools/testing/selftests/kvm/lib/x86_64/hyperv.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index becdd8245e846..9218bb5f44bfd 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -274,6 +274,11 @@ struct hyperv_test_pages { uint64_t vp_assist_gpa; void *vp_assist; + /* Partition assist page */ + void *partition_assist_hva; + uint64_t partition_assist_gpa; + void *partition_assist; + /* Enlightened VMCS */ void *enlightened_vmcs_hva; uint64_t enlightened_vmcs_gpa; diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index a2fc083c65efe..efb7e7a1354dc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -19,6 +19,11 @@ struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist); hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist); + /* Setup of a region of guest memory for the partition assist page. */ + hv->partition_assist = (void *)vm_vaddr_alloc_page(vm); + hv->partition_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->partition_assist); + hv->partition_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->partition_assist); + /* Setup of a region of guest memory for the enlightened VMCS. */ hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs); -- GitLab From 8fda37cf3d41f1dfb0667c4b10e3dd01d17735b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:22 +0100 Subject: [PATCH 0821/1423] KVM: selftests: Stuff RAX/RCX with 'safe' values in vmmcall()/vmcall() vmmcall()/vmcall() are used to exit from L2 to L1 and no concrete hypercall ABI is currenty followed. With the introduction of Hyper-V L2 TLB flush it becomes (theoretically) possible that L0 will take responsibility for handling the call and no L1 exit will happen. Prevent this by stuffing RAX (KVM ABI) and RCX (Hyper-V ABI) with 'safe' values. While on it, convert vmmcall() to 'static inline', make it setup stack frame and move to include/x86_64/svm_util.h. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-45-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 5 ----- .../selftests/kvm/include/x86_64/svm_util.h | 14 ++++++++++++++ tools/testing/selftests/kvm/include/x86_64/vmx.h | 15 ++++++++++----- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a10f39affa45e..5d310abe6c3f4 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -677,11 +677,6 @@ static inline void cpu_relax(void) asm volatile("rep; nop" ::: "memory"); } -#define vmmcall() \ - __asm__ __volatile__( \ - "vmmcall\n" \ - ) - #define ud2() \ __asm__ __volatile__( \ "ud2\n" \ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index 7aee6244ab6ac..044f0f872ba9d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -32,6 +32,20 @@ struct svm_test_data { uint64_t msr_gpa; }; +static inline void vmmcall(void) +{ + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. + */ + __asm__ __volatile__("push %%rbp; vmmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + #define stgi() \ __asm__ __volatile__( \ "stgi\n" \ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index ef784bd6dfc2d..5f0c0a29c556e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -437,11 +437,16 @@ static inline int vmresume(void) static inline void vmcall(void) { - /* Currently, L1 destroys our GPRs during vmexits. */ - __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : : - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15"); + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. + */ + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); } static inline int vmread(uint64_t encoding, uint64_t *value) -- GitLab From 75ee7505feae16bbfbed62115e04f762047c4765 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:23 +0100 Subject: [PATCH 0822/1423] KVM: selftests: Introduce rdmsr_from_l2() and use it for MSR-Bitmap tests Hyper-V MSR-Bitmap tests do RDMSR from L2 to exit to L1. While 'evmcs_test' correctly clobbers all GPRs (which are not preserved), 'hyperv_svm_test' does not. Introduce a more generic rdmsr_from_l2() to avoid code duplication and remove hardcoding of MSRs. Do not put it in common code because it is really just a selftests bug rather than a processor feature that requires it. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-46-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/evmcs_test.c | 27 +++++++------------ .../selftests/kvm/x86_64/hyperv_svm_test.c | 17 +++++++++--- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 74f076ba574b3..58fa98512c248 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -30,22 +30,15 @@ static void guest_nmi_handler(struct ex_regs *regs) { } -/* Exits to L1 destroy GRPs! */ -static inline void rdmsr_fs_base(void) +static inline void rdmsr_from_l2(uint32_t msr) { - __asm__ __volatile__ ("mov $0xc0000100, %%rcx; rdmsr" : : : - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15"); -} -static inline void rdmsr_gs_base(void) -{ - __asm__ __volatile__ ("mov $0xc0000101, %%rcx; rdmsr" : : : - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15"); + /* Currently, L1 doesn't preserve GPRs during vmexits. */ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); } +/* Exit to L1 from L2 with RDMSR instruction */ void l2_guest_code(void) { GUEST_SYNC(7); @@ -58,11 +51,11 @@ void l2_guest_code(void) vmcall(); /* MSR-Bitmap tests */ - rdmsr_fs_base(); /* intercepted */ - rdmsr_fs_base(); /* intercepted */ - rdmsr_gs_base(); /* not intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ vmcall(); - rdmsr_gs_base(); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ /* Done, exit to L1 and never come back. */ vmcall(); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 1c3fc38b4f151..3c9a2a1b4cfd8 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -23,6 +23,15 @@ #define L2_GUEST_STACK_SIZE 256 +/* Exit to L1 from L2 with RDMSR instruction */ +static inline void rdmsr_from_l2(uint32_t msr) +{ + /* Currently, L1 doesn't preserve GPRs during vmexits. */ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + void l2_guest_code(void) { GUEST_SYNC(3); @@ -30,11 +39,11 @@ void l2_guest_code(void) vmmcall(); /* MSR-Bitmap tests */ - rdmsr(MSR_FS_BASE); /* intercepted */ - rdmsr(MSR_FS_BASE); /* intercepted */ - rdmsr(MSR_GS_BASE); /* not intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ vmmcall(); - rdmsr(MSR_GS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ GUEST_SYNC(5); -- GitLab From 4b5d8b222bf185bda25b56de403afde7b6d3c466 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:24 +0100 Subject: [PATCH 0823/1423] KVM: selftests: evmcs_test: Introduce L2 TLB flush test Enable Hyper-V L2 TLB flush and check that Hyper-V TLB flush hypercalls from L2 don't exit to L1 unless 'TlbLockCount' is set in the Partition assist page. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-47-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/evmcs.h | 2 + .../testing/selftests/kvm/x86_64/evmcs_test.c | 50 ++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 94d6059e9a12d..901caf0e0939b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -237,6 +237,8 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + extern struct hv_enlightened_vmcs *current_evmcs; int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 58fa98512c248..ba09d300c9539 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -16,6 +16,7 @@ #include "kvm_util.h" +#include "hyperv.h" #include "vmx.h" static int ud_count; @@ -41,6 +42,8 @@ static inline void rdmsr_from_l2(uint32_t msr) /* Exit to L1 from L2 with RDMSR instruction */ void l2_guest_code(void) { + u64 unused; + GUEST_SYNC(7); GUEST_SYNC(8); @@ -57,15 +60,31 @@ void l2_guest_code(void) vmcall(); rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS, + &unused); + /* Done, exit to L1 and never come back. */ vmcall(); } -void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages) +void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, + vm_vaddr_t hv_hcall_page_gpa) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa); + x2apic_enable(); GUEST_SYNC(1); @@ -95,7 +114,17 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages) vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | PIN_BASED_NMI_EXITING); + /* L2 TLB flush setup */ + current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa; + current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + current_evmcs->hv_vm_id = 1; + current_evmcs->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR); GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); /* @@ -139,6 +168,18 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages) GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); current_evmcs->guest_rip += 2; /* rdmsr */ + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. + */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH); + GUEST_ASSERT(!vmresume()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); GUEST_SYNC(11); @@ -192,6 +233,7 @@ static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -205,12 +247,16 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + vcpu_set_hv_cpuid(vcpu); vcpu_enable_evmcs(vcpu); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); - vcpu_args_set(vcpu, 2, vmx_pages_gva, hv_pages_gva); + vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); -- GitLab From 9c2e881945dca4904e8817acf4f0a928570bd400 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:25 +0100 Subject: [PATCH 0824/1423] KVM: selftests: hyperv_svm_test: Introduce L2 TLB flush test Enable Hyper-V L2 TLB flush and check that Hyper-V TLB flush hypercalls from L2 don't exit to L1 unless 'TlbLockCount' is set in the Partition assist page. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-48-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/svm.h | 4 ++ .../selftests/kvm/x86_64/hyperv_svm_test.c | 59 +++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index 483e6ae12f69e..4803e10560558 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -76,6 +76,10 @@ struct hv_vmcb_enlightenments { */ #define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercept_cr; u32 intercept_dr; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 3c9a2a1b4cfd8..3b3cc94ba8e47 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -34,6 +34,8 @@ static inline void rdmsr_from_l2(uint32_t msr) void l2_guest_code(void) { + u64 unused; + GUEST_SYNC(3); /* Exit to L1 */ vmmcall(); @@ -47,11 +49,28 @@ void l2_guest_code(void) GUEST_SYNC(5); + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS, &unused); + /* Done, exit to L1 and never come back. */ vmmcall(); } -static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) +static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, + struct hyperv_test_pages *hv_pages, + vm_vaddr_t pgs_gpa) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; @@ -59,13 +78,23 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) GUEST_SYNC(1); - wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); GUEST_ASSERT(svm->vmcb_gpa); /* Prepare for L2 execution. */ generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + /* L2 TLB flush setup */ + hve->partition_assist_page = hv_pages->partition_assist_gpa; + hve->hv_enlightenments_control.nested_flush_hypercall = 1; + hve->hv_vm_id = 1; + hve->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + GUEST_SYNC(2); run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); @@ -100,6 +129,20 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); vmcb->save.rip += 2; /* rdmsr */ + + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. + */ + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL); + GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH); + run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); GUEST_SYNC(6); @@ -109,8 +152,8 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm) int main(int argc, char *argv[]) { - vm_vaddr_t nested_gva = 0; - + vm_vaddr_t nested_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; @@ -124,7 +167,13 @@ int main(int argc, char *argv[]) vcpu_set_hv_cpuid(vcpu); run = vcpu->run; vcpu_alloc_svm(vm, &nested_gva); - vcpu_args_set(vcpu, 1, nested_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + + vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); for (stage = 1;; stage++) { vcpu_run(vcpu); -- GitLab From 0fa32dad1e78629cb42999dacd82489503fdf4c2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 1 Nov 2022 15:54:26 +0100 Subject: [PATCH 0825/1423] KVM: selftests: Rename 'evmcs_test' to 'hyperv_evmcs' Conform to the rest of Hyper-V emulation selftests which have 'hyperv' prefix. Get rid of '_test' suffix as well as the purpose of this code is fairly obvious. Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20221101145426.251680-49-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 2 +- tools/testing/selftests/kvm/Makefile | 2 +- .../selftests/kvm/x86_64/{evmcs_test.c => hyperv_evmcs.c} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename tools/testing/selftests/kvm/x86_64/{evmcs_test.c => hyperv_evmcs.c} (100%) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index dc7e28cf2da07..082855d94c722 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -16,7 +16,6 @@ /x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs -/x86_64/evmcs_test /x86_64/exit_on_emulation_failure_test /x86_64/fix_hypercall_test /x86_64/get_msr_index_features @@ -24,6 +23,7 @@ /x86_64/kvm_pv_test /x86_64/hyperv_clock /x86_64/hyperv_cpuid +/x86_64/hyperv_evmcs /x86_64/hyperv_features /x86_64/hyperv_ipi /x86_64/hyperv_svm_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 246d52e9df60d..2275ba861e0e5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -82,11 +82,11 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features -TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c similarity index 100% rename from tools/testing/selftests/kvm/x86_64/evmcs_test.c rename to tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c -- GitLab From 28b4b0596343d19d140da059eee0e5c2b5328731 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 17 Nov 2022 13:02:56 -0800 Subject: [PATCH 0826/1423] xfs: fix incorrect i_nlink caused by inode racing The following error occurred during the fsstress test: XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452 The problem was that inode race condition causes incorrect i_nlink to be written to disk, and then it is read into memory. Consider the following call graph, inodes that are marked as both XFS_IFLUSHING and XFS_IRECLAIMABLE, i_nlink will be reset to 1 and then restored to original value in xfs_reinit_inode(). Therefore, the i_nlink of directory on disk may be set to 1. xfsaild xfs_inode_item_push xfs_iflush_cluster xfs_iflush xfs_inode_to_disk xfs_iget xfs_iget_cache_hit xfs_iget_recycle xfs_reinit_inode inode_init_always xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal inode state and can race with other RCU protected inode lookups. On the read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu + ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from racing inode updates (during transactions) by that lock. Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eae7427062cf9..f35e2cee52655 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -342,6 +342,9 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return -EAGAIN; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle @@ -355,6 +358,7 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -518,6 +522,8 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock. */ error = xfs_iget_recycle(pag, ip); + if (error == -EAGAIN) + goto out_skip; if (error) return error; } else { -- GitLab From 2a402120a8d413238999a67ebff5b7dca0e5d14c Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Wed, 16 Nov 2022 10:45:35 +0100 Subject: [PATCH 0827/1423] IB/isert: use the ISCSI_LOGIN_CURRENT_STAGE macro Use the proper macro to get the current_stage value. Link: https://lore.kernel.org/r/20221116094535.138298-1-mlombard@redhat.com Signed-off-by: Maurizio Lombardi Reviewed-by: Mike Christie Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b360a1527cd10..75404885cf981 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -993,9 +993,8 @@ isert_rx_login_req(struct isert_conn *isert_conn) * login request PDU. */ login->leading_connection = (!login_req->tsih) ? 1 : 0; - login->current_stage = - (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) - >> 2; + login->current_stage = ISCSI_LOGIN_CURRENT_STAGE( + login_req->flags); login->version_min = login_req->min_version; login->version_max = login_req->max_version; memcpy(login->isid, login_req->isid, 6); -- GitLab From 2d9cd957d40c3ac491b358e7cff0515bb07a3a9c Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 21 Nov 2022 10:00:29 +0800 Subject: [PATCH 0828/1423] PCI: Check for alloc failure in pci_request_irq() When kvasprintf() fails to allocate memory, it returns a NULL pointer. Return error from pci_request_irq() so we don't dereference it. [bhelgaas: commit log] Fixes: 704e8953d3e9 ("PCI/irq: Add pci_request_irq() and pci_free_irq() helpers") Link: https://lore.kernel.org/r/20221121020029.3759444-1-zengheng4@huawei.com Signed-off-by: Zeng Heng Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c index 12ecd0aaa28d6..0050e8f6814ed 100644 --- a/drivers/pci/irq.c +++ b/drivers/pci/irq.c @@ -44,6 +44,8 @@ int pci_request_irq(struct pci_dev *dev, unsigned int nr, irq_handler_t handler, va_start(ap, fmt); devname = kvasprintf(GFP_KERNEL, fmt, ap); va_end(ap); + if (!devname) + return -ENOMEM; ret = request_threaded_irq(pci_irq_vector(dev, nr), handler, thread_fn, irqflags, devname, dev_id); -- GitLab From 9b51d072da1d27e1193e84708201c48e385ad912 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Thu, 17 Nov 2022 21:15:46 +0800 Subject: [PATCH 0829/1423] RDMA/hfi: Decrease PCI device reference count in error path pci_get_device() will increase the reference count for the returned pci_dev, and also decrease the reference count for the input parameter *from* if it is not NULL. If we break out the loop in node_affinity_init() with 'dev' not NULL, we need to call pci_dev_put() to decrease the reference count. Add missing pci_dev_put() in error path. Fixes: c513de490f80 ("IB/hfi1: Invalid NUMA node information can cause a divide by zero") Signed-off-by: Xiongfeng Wang Link: https://lore.kernel.org/r/20221117131546.113280-1-wangxiongfeng2@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/affinity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 877f8e84a672a..77ee77d4000fb 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -177,6 +177,8 @@ out: for (node = 0; node < node_affinity.num_possible_nodes; node++) hfi1_per_node_cntr[node] = 1; + pci_dev_put(dev); + return 0; } -- GitLab From 8e96729fc26c8967db45a3fb7a60387619f77a22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 21 Nov 2022 18:22:36 +0100 Subject: [PATCH 0830/1423] crypto: ccree - Make cc_debugfs_global_fini() available for module init function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ccree_init() calls cc_debugfs_global_fini(), the former is an init function and the latter an exit function though. A modular build emits: WARNING: modpost: drivers/crypto/ccree/ccree.o: section mismatch in reference: init_module (section: .init.text) -> cc_debugfs_global_fini (section: .exit.text) (with CONFIG_DEBUG_SECTION_MISMATCH=y). Fixes: 4f1c596df706 ("crypto: ccree - Remove debugfs when platform_driver_register failed") Signed-off-by: Uwe Kleine-König Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ccree/cc_debugfs.c b/drivers/crypto/ccree/cc_debugfs.c index 7083767602fcf..8f008f024f8f1 100644 --- a/drivers/crypto/ccree/cc_debugfs.c +++ b/drivers/crypto/ccree/cc_debugfs.c @@ -55,7 +55,7 @@ void __init cc_debugfs_global_init(void) cc_debugfs_dir = debugfs_create_dir("ccree", NULL); } -void __exit cc_debugfs_global_fini(void) +void cc_debugfs_global_fini(void) { debugfs_remove(cc_debugfs_dir); } -- GitLab From 3a5154c723ba5ceb9ce374a7307e03263c03fd29 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 18 Nov 2022 18:22:20 +0000 Subject: [PATCH 0831/1423] KVM: arm64: Take a pointer to walker data in kvm_dereference_pteref() Rather than passing through the state of the KVM_PGTABLE_WALK_SHARED flag, just take a pointer to the whole walker structure instead. Move around struct kvm_pgtable and the RCU indirection such that the associated ifdeffery remains in one place while ensuring the walker + flags definitions precede their use. No functional change intended. Signed-off-by: Oliver Upton Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118182222.3932898-2-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 144 ++++++++++++++------------- arch/arm64/kvm/hyp/pgtable.c | 6 +- 2 files changed, 76 insertions(+), 74 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index a874ce0ce7b51..f23af693e3c52 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -37,54 +37,6 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; -/* - * RCU cannot be used in a non-kernel context such as the hyp. As such, page - * table walkers used in hyp do not call into RCU and instead use other - * synchronization mechanisms (such as a spinlock). - */ -#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) - -typedef kvm_pte_t *kvm_pteref_t; - -static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared) -{ - return pteref; -} - -static inline void kvm_pgtable_walk_begin(void) {} -static inline void kvm_pgtable_walk_end(void) {} - -static inline bool kvm_pgtable_walk_lock_held(void) -{ - return true; -} - -#else - -typedef kvm_pte_t __rcu *kvm_pteref_t; - -static inline kvm_pte_t *kvm_dereference_pteref(kvm_pteref_t pteref, bool shared) -{ - return rcu_dereference_check(pteref, !shared); -} - -static inline void kvm_pgtable_walk_begin(void) -{ - rcu_read_lock(); -} - -static inline void kvm_pgtable_walk_end(void) -{ - rcu_read_unlock(); -} - -static inline bool kvm_pgtable_walk_lock_held(void) -{ - return rcu_read_lock_held(); -} - -#endif - #define KVM_PTE_VALID BIT(0) #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) @@ -212,29 +164,6 @@ enum kvm_pgtable_prot { typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); -/** - * struct kvm_pgtable - KVM page-table. - * @ia_bits: Maximum input address size, in bits. - * @start_level: Level at which the page-table walk starts. - * @pgd: Pointer to the first top-level entry of the page-table. - * @mm_ops: Memory management callbacks. - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. - * @flags: Stage-2 page-table flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. - */ -struct kvm_pgtable { - u32 ia_bits; - u32 start_level; - kvm_pteref_t pgd; - struct kvm_pgtable_mm_ops *mm_ops; - - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; - kvm_pgtable_force_pte_cb_t force_pte_cb; -}; - /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid @@ -285,6 +214,79 @@ struct kvm_pgtable_walker { const enum kvm_pgtable_walk_flags flags; }; +/* + * RCU cannot be used in a non-kernel context such as the hyp. As such, page + * table walkers used in hyp do not call into RCU and instead use other + * synchronization mechanisms (such as a spinlock). + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + +typedef kvm_pte_t *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, + kvm_pteref_t pteref) +{ + return pteref; +} + +static inline void kvm_pgtable_walk_begin(void) {} +static inline void kvm_pgtable_walk_end(void) {} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return true; +} + +#else + +typedef kvm_pte_t __rcu *kvm_pteref_t; + +static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, + kvm_pteref_t pteref) +{ + return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); +} + +static inline void kvm_pgtable_walk_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_pgtable_walk_end(void) +{ + rcu_read_unlock(); +} + +static inline bool kvm_pgtable_walk_lock_held(void) +{ + return rcu_read_lock_held(); +} + +#endif + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pteref_t pgd; + struct kvm_pgtable_mm_ops *mm_ops; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; +}; + /** * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. * @pgt: Uninitialised page-table structure to initialise. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5bca9610d040d..b5b91a8828365 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -188,7 +188,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, kvm_pteref_t pteref, u32 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; - kvm_pte_t *ptep = kvm_dereference_pteref(pteref, flags & KVM_PGTABLE_WALK_SHARED); + kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, .old = READ_ONCE(*ptep), @@ -558,7 +558,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); - pgt->mm_ops->put_page(kvm_dereference_pteref(pgt->pgd, false)); + pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); pgt->pgd = NULL; } @@ -1241,7 +1241,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(pgt->pgd, false), pgd_sz); + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } -- GitLab From b7833bf202e3068abb77c642a0843f696e9c8d38 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 18 Nov 2022 18:22:21 +0000 Subject: [PATCH 0832/1423] KVM: arm64: Don't acquire RCU read lock for exclusive table walks Marek reported a BUG resulting from the recent parallel faults changes, as the hyp stage-1 map walker attempted to allocate table memory while holding the RCU read lock: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0 preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 2 locks held by swapper/0/1: #0: ffff80000a8a44d0 (kvm_hyp_pgd_mutex){+.+.}-{3:3}, at: __create_hyp_mappings+0x80/0xc4 #1: ffff80000a927720 (rcu_read_lock){....}-{1:2}, at: kvm_pgtable_walk+0x0/0x1f4 CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc3+ #5918 Hardware name: Raspberry Pi 3 Model B (DT) Call trace: dump_backtrace.part.0+0xe4/0xf0 show_stack+0x18/0x40 dump_stack_lvl+0x8c/0xb8 dump_stack+0x18/0x34 __might_resched+0x178/0x220 __might_sleep+0x48/0xa0 prepare_alloc_pages+0x178/0x1a0 __alloc_pages+0x9c/0x109c alloc_page_interleave+0x1c/0xc4 alloc_pages+0xec/0x160 get_zeroed_page+0x1c/0x44 kvm_hyp_zalloc_page+0x14/0x20 hyp_map_walker+0xd4/0x134 kvm_pgtable_visitor_cb.isra.0+0x38/0x5c __kvm_pgtable_walk+0x1a4/0x220 kvm_pgtable_walk+0x104/0x1f4 kvm_pgtable_hyp_map+0x80/0xc4 __create_hyp_mappings+0x9c/0xc4 kvm_mmu_init+0x144/0x1cc kvm_arch_init+0xe4/0xef4 kvm_init+0x3c/0x3d0 arm_init+0x20/0x30 do_one_initcall+0x74/0x400 kernel_init_freeable+0x2e0/0x350 kernel_init+0x24/0x130 ret_from_fork+0x10/0x20 Since the hyp stage-1 table walkers are serialized by kvm_hyp_pgd_mutex, RCU protection really doesn't add anything. Don't acquire the RCU read lock for an exclusive walk. Reported-by: Marek Szyprowski Signed-off-by: Oliver Upton Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118182222.3932898-3-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 14 ++++++++------ arch/arm64/kvm/hyp/pgtable.c | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index f23af693e3c52..4b6b52ebc11c3 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -229,8 +229,8 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } -static inline void kvm_pgtable_walk_begin(void) {} -static inline void kvm_pgtable_walk_end(void) {} +static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) {} +static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {} static inline bool kvm_pgtable_walk_lock_held(void) { @@ -247,14 +247,16 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } -static inline void kvm_pgtable_walk_begin(void) +static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { - rcu_read_lock(); + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + rcu_read_lock(); } -static inline void kvm_pgtable_walk_end(void) +static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) { - rcu_read_unlock(); + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + rcu_read_unlock(); } static inline bool kvm_pgtable_walk_lock_held(void) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b5b91a8828365..d6f3753cb87ed 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -289,9 +289,9 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, }; int r; - kvm_pgtable_walk_begin(); + kvm_pgtable_walk_begin(walker); r = _kvm_pgtable_walk(pgt, &walk_data); - kvm_pgtable_walk_end(); + kvm_pgtable_walk_end(walker); return r; } -- GitLab From 5e806c5812e8012a83496cf96bdba266b3aec428 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 18 Nov 2022 18:22:22 +0000 Subject: [PATCH 0833/1423] KVM: arm64: Reject shared table walks in the hyp code Exclusive table walks are the only supported table walk in the hyp, as there is no construct like RCU available in the hypervisor code. Reject any attempt to do a shared table walk by returning an error and allowing the caller to clean up the mess. Suggested-by: Will Deacon Signed-off-by: Oliver Upton Acked-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118182222.3932898-4-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 17 +++++++++++++++-- arch/arm64/kvm/hyp/pgtable.c | 5 ++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4b6b52ebc11c3..d5cb01f8dc066 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -229,7 +229,18 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } -static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) {} +static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) +{ + /* + * Due to the lack of RCU (or a similar protection scheme), only + * non-shared table walkers are allowed in the hypervisor. + */ + if (walker->flags & KVM_PGTABLE_WALK_SHARED) + return -EPERM; + + return 0; +} + static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {} static inline bool kvm_pgtable_walk_lock_held(void) @@ -247,10 +258,12 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } -static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) +static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) rcu_read_lock(); + + return 0; } static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index d6f3753cb87ed..58dbe0ab567fe 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -289,7 +289,10 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, }; int r; - kvm_pgtable_walk_begin(walker); + r = kvm_pgtable_walk_begin(walker); + if (r) + return r; + r = _kvm_pgtable_walk(pgt, &walk_data); kvm_pgtable_walk_end(walker); -- GitLab From 9907526d25c4ad8a6e3006487a544140776ba005 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 21 Nov 2022 18:44:10 -0600 Subject: [PATCH 0834/1423] RDMA/irdma: Initialize net_type before checking it The av->net_type is not initialized before it is checked in irdma_modify_qp_roce. This leads to an incorrect update to the ARP cache and QP context. RoCEv2 connections might fail as result. Set the net_type using rdma_gid_attr_network_type. Fixes: 80005c43d4c8 ("RDMA/irdma: Use net_type to check network type") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221122004410.1471-1-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index dc3f5f3fee901..f6973ea55eda7 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1213,6 +1213,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, av->attrs = attr->ah_attr; rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); rdma_gid2ip((struct sockaddr *)&av->dgid_addr, &attr->ah_attr.grh.dgid); + av->net_type = rdma_gid_attr_network_type(sgid_attr); if (av->net_type == RDMA_NETWORK_IPV6) { __be32 *daddr = av->dgid_addr.saddr_in6.sin6_addr.in6_u.u6_addr32; -- GitLab From a115aa00b18f7b8982b8f458149632caf64a862a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 19 Nov 2022 15:08:34 +0800 Subject: [PATCH 0835/1423] RDMA/hns: fix memory leak in hns_roce_alloc_mr() When hns_roce_mr_enable() failed in hns_roce_alloc_mr(), mr_key is not released. Compiled test only. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20221119070834.48502-1-shaozhengchao@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 845ac7d3831f4..37a5cf62f88b4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -392,10 +392,10 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, return &mr->ibmr; -err_key: - free_mr_key(hr_dev, mr); err_pbl: free_mr_pbl(hr_dev, mr); +err_key: + free_mr_key(hr_dev, mr); err_free: kfree(mr); return ERR_PTR(ret); -- GitLab From 05f5747414c6ecb8a7f9b0c1dc10bcffa6dfb5ba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2022 19:15:18 +0100 Subject: [PATCH 0836/1423] PCI/portdrv: Set PCIE_PORT_SERVICE_HP for Root and Downstream Ports only It is reported that on some systems pciehp binds to an Upstream Port and attempts to operate it which causes devices below the Port to disappear from the bus. This happens because acpiphp sets dev->is_hotplug_bridge for that Port (after receiving a Device Check notification on it from the platform firmware via ACPI) during the enumeration of PCI devices. get_port_device_capability() sees that dev->is_hotplug_bridge is set and adds PCIE_PORT_SERVICE_HP to Port services (which allows pciehp to bind to the Port in question) without consulting the PCIe type, which should be either Root Port or Downstream Port for the hotplug capability to be present. Per PCIe r6.0, sec 7.5.3.2, the Slot Implemented bit is only valid for Downstream Ports (including Root Ports), and PCIe hotplug depends on the Slot Capabilities / Control / Status registers. Make get_port_device_capability() more robust by adding a PCIe type check to it before adding PCIE_PORT_SERVICE_HP to Port services which helps to avoid the problem. [bhelgaas: add spec citation] Suggested-by: Lukas Wunner Link: https://lore.kernel.org/r/4786090.31r3eYUQgx@kreacher Reported-by: Rodrigo Vivi Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner --- drivers/pci/pcie/portdrv_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 1ac7fec47d6fb..98f0126aaa903 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -209,6 +209,8 @@ static int get_port_device_capability(struct pci_dev *dev) int services = 0; if (dev->is_hotplug_bridge && + (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || + pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM) && (pcie_ports_native || host->native_pcie_hotplug)) { services |= PCIE_PORT_SERVICE_HP; -- GitLab From c63a3be76df678b173c59f1d5dc19a21b2d1c753 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2022 19:16:57 +0100 Subject: [PATCH 0837/1423] PCI: acpiphp: Avoid setting is_hotplug_bridge for PCIe Upstream Ports It is reported that on some systems pciehp binds to an Upstream Port and attempts to operate it which causes devices below the Port to disappear from the bus. This happens because acpiphp sets dev->is_hotplug_bridge for that Port (after receiving a Device Check notification on it from the platform firmware via ACPI) during the enumeration of PCI devices. get_port_device_capability() sees that dev->is_hotplug_bridge is set and adds PCIE_PORT_SERVICE_HP to Port services, which allows pciehp to bind to the Port in question. Even though this particular problem can be addressed by making the portdrv_core checks more robust, it also causes power management to work differently on the affected systems which generally is not desirable (PCIe Ports with dev->is_hotplug_bridge set have to pass additional tests to be allowed to go into the D3hot/cold power states which affects runtime PM of devices below these Ports). For this reason, amend check_hotplug_bridge() with a PCIe type check to prevent it from setting dev->is_hotplug_bridge for Upstream Ports. Suggested-by: Lukas Wunner Link: https://lore.kernel.org/r/2262230.ElGaqSPkdT@kreacher Reported-by: Rodrigo Vivi Tested-by: Rodrigo Vivi Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas Reviewed-by: Rodrigo Vivi Reviewed-by: Lukas Wunner --- drivers/pci/hotplug/acpiphp_glue.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 6efa3d8db9a56..5b1f271c6034b 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -411,6 +411,14 @@ static void check_hotplug_bridge(struct acpiphp_slot *slot, struct pci_dev *dev) if (dev->is_hotplug_bridge) return; + /* + * In the PCIe case, only Root Ports and Downstream Ports are capable of + * accommodating hotplug devices, so avoid marking Upstream Ports as + * "hotplug bridges". + */ + if (pci_is_pcie(dev) && pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) + return; + list_for_each_entry(func, &slot->funcs, sibling) { if (PCI_FUNC(dev->devfn) == func->function) { dev->is_hotplug_bridge = 1; -- GitLab From cb6562c380832a930ffd1722ac9d479b454aed4e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 22 Nov 2022 15:37:39 -0400 Subject: [PATCH 0838/1423] RDMA/rxe: Do not NULL deref on debugging failure path Correct the mistake, mr is obviously NULL in this code path. Fixes: 2778b72b1df0 ("RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mr.c") Link: https://lore.kernel.org/r/Y3eeJW0AdyJYhYyQ@kili Reported-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b1423000e4bcd..b7c9ff1ddf0e1 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -519,7 +519,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { - rxe_dbg_mr(mr, "No MR for key %#x\n", key); + rxe_dbg_qp(qp, "No MR for key %#x\n", key); ret = -EINVAL; goto err; } -- GitLab From f67376d801499f4fa0838c18c1efcad8840e550d Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 22 Nov 2022 23:14:37 +0800 Subject: [PATCH 0839/1423] RDMA/rxe: Fix NULL-ptr-deref in rxe_qp_do_cleanup() when socket create failed There is a null-ptr-deref when mount.cifs over rdma: BUG: KASAN: null-ptr-deref in rxe_qp_do_cleanup+0x2f3/0x360 [rdma_rxe] Read of size 8 at addr 0000000000000018 by task mount.cifs/3046 CPU: 2 PID: 3046 Comm: mount.cifs Not tainted 6.1.0-rc5+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc3 Call Trace: dump_stack_lvl+0x34/0x44 kasan_report+0xad/0x130 rxe_qp_do_cleanup+0x2f3/0x360 [rdma_rxe] execute_in_process_context+0x25/0x90 __rxe_cleanup+0x101/0x1d0 [rdma_rxe] rxe_create_qp+0x16a/0x180 [rdma_rxe] create_qp.part.0+0x27d/0x340 ib_create_qp_kernel+0x73/0x160 rdma_create_qp+0x100/0x230 _smbd_get_connection+0x752/0x20f0 smbd_get_connection+0x21/0x40 cifs_get_tcp_session+0x8ef/0xda0 mount_get_conns+0x60/0x750 cifs_mount+0x103/0xd00 cifs_smb3_do_mount+0x1dd/0xcb0 smb3_get_tree+0x1d5/0x300 vfs_get_tree+0x41/0xf0 path_mount+0x9b3/0xdd0 __x64_sys_mount+0x190/0x1d0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The root cause of the issue is the socket create failed in rxe_qp_init_req(). So move the reset rxe_qp_do_cleanup() after the NULL ptr check. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20221122151437.1057671-1-zhangxiaoxu5@huawei.com Signed-off-by: Zhang Xiaoxu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 46f6c74ce00e1..ab72db68b58f6 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -817,12 +817,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->resp.mr) rxe_put(qp->resp.mr); - if (qp_type(qp) == IB_QPT_RC) - sk_dst_reset(qp->sk->sk); - free_rd_atomic_resources(qp); if (qp->sk) { + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); + kernel_sock_shutdown(qp->sk, SHUT_RDWR); sock_release(qp->sk); } -- GitLab From 9676f40618df9f8e1ab681486021d6c0df86c5fa Mon Sep 17 00:00:00 2001 From: Ian Cowan Date: Sat, 12 Nov 2022 09:28:57 -0500 Subject: [PATCH 0840/1423] PCI: shpchp: Remove unused get_mode1_ECC_cap callback The ->get_mode1_ECC_cap callback in the shpchp_hpc_ops struct is never called, so remove it. [bhelgaas: squash] Link: https://lore.kernel.org/r/20221112142859.319733-2-ian@linux.cowan.aero Link: https://lore.kernel.org/r/20221112142859.319733-3-ian@linux.cowan.aero Link: https://lore.kernel.org/r/20221112142859.319733-4-ian@linux.cowan.aero Signed-off-by: Ian Cowan Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/TODO | 3 --- drivers/pci/hotplug/shpchp.h | 1 - drivers/pci/hotplug/shpchp_hpc.c | 18 ------------------ 3 files changed, 22 deletions(-) diff --git a/drivers/pci/hotplug/TODO b/drivers/pci/hotplug/TODO index 88f217c82b4ff..fdb8dd6ea24dc 100644 --- a/drivers/pci/hotplug/TODO +++ b/drivers/pci/hotplug/TODO @@ -58,9 +58,6 @@ shpchp: pciehp with commit 82a9e79ef132 ("PCI: pciehp: remove hpc_ops"). Clarify if there was a specific reason not to apply the same change to shpchp. -* The ->get_mode1_ECC_cap callback in shpchp_hpc_ops is never invoked. - Why was it introduced? Can it be removed? - * The hardirq handler shpc_isr() queues events on a workqueue. It can be simplified by converting it to threaded IRQ handling. Use pciehp as a template. diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index 6e85885b554c5..3a97f455336e6 100644 --- a/drivers/pci/hotplug/shpchp.h +++ b/drivers/pci/hotplug/shpchp.h @@ -311,7 +311,6 @@ struct hpc_ops { int (*get_latch_status)(struct slot *slot, u8 *status); int (*get_adapter_status)(struct slot *slot, u8 *status); int (*get_adapter_speed)(struct slot *slot, enum pci_bus_speed *speed); - int (*get_mode1_ECC_cap)(struct slot *slot, u8 *mode); int (*get_prog_int)(struct slot *slot, u8 *prog_int); int (*query_power_fault)(struct slot *slot); void (*green_led_on)(struct slot *slot); diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index bd7557ca49108..48e4daefc44af 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -489,23 +489,6 @@ static int hpc_get_adapter_speed(struct slot *slot, enum pci_bus_speed *value) return retval; } -static int hpc_get_mode1_ECC_cap(struct slot *slot, u8 *mode) -{ - int retval = 0; - struct controller *ctrl = slot->ctrl; - u16 sec_bus_status = shpc_readw(ctrl, SEC_BUS_CONFIG); - u8 pi = shpc_readb(ctrl, PROG_INTERFACE); - - if (pi == 2) { - *mode = (sec_bus_status & 0x0100) >> 8; - } else { - retval = -1; - } - - ctrl_dbg(ctrl, "Mode 1 ECC cap = %d\n", *mode); - return retval; -} - static int hpc_query_power_fault(struct slot *slot) { struct controller *ctrl = slot->ctrl; @@ -900,7 +883,6 @@ static const struct hpc_ops shpchp_hpc_ops = { .get_adapter_status = hpc_get_adapter_status, .get_adapter_speed = hpc_get_adapter_speed, - .get_mode1_ECC_cap = hpc_get_mode1_ECC_cap, .get_prog_int = hpc_get_prog_int, .query_power_fault = hpc_query_power_fault, -- GitLab From 198dd8aedee6a7d2de0dfa739f9a008a938f6848 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:11 +1100 Subject: [PATCH 0841/1423] xfs: punching delalloc extents on write failure is racy xfs_buffered_write_iomap_end() has a comment about the safety of punching delalloc extents based holding the IOLOCK_EXCL. This comment is wrong, and punching delalloc extents is not race free. When we punch out a delalloc extent after a write failure in xfs_buffered_write_iomap_end(), we punch out the page cache with truncate_pagecache_range() before we punch out the delalloc extents. At this point, we only hold the IOLOCK_EXCL, so there is nothing stopping mmap() write faults racing with this cleanup operation, reinstantiating a folio over the range we are about to punch and hence requiring the delalloc extent to be kept. If this race condition is hit, we can end up with a dirty page in the page cache that has no delalloc extent or space reservation backing it. This leads to bad things happening at writeback time. To avoid this race condition, we need the page cache truncation to be atomic w.r.t. the extent manipulation. We can do this by holding the mapping->invalidate_lock exclusively across this operation - this will prevent new pages from being inserted into the page cache whilst we are removing the pages and the backing extent and space reservation. Taking the mapping->invalidate_lock exclusively in the buffered write IO path is safe - it naturally nests inside the IOLOCK (see truncate and fallocate paths). iomap_zero_range() can be called from under the mapping->invalidate_lock (from the truncate path via either xfs_zero_eof() or xfs_truncate_page(), but iomap_zero_iter() will not instantiate new delalloc pages (because it skips holes) and hence will not ever need to punch out delalloc extents on failure. Fix the locking issue, and clean up the code logic a little to avoid unnecessary work if we didn't allocate the delalloc extent or wrote the entire region we allocated. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5cea069a38b4e..a2e45ea1b0cb3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end( written = 0; } + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + /* * start_fsb refers to the first unused block after a short write. If * nothing was written, round offset down to point at the first block in @@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end( start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); + /* Nothing to do if we've written the entire delalloc extent */ + if (start_fsb >= end_fsb) + return 0; + /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + filemap_invalidate_unlock(inode->i_mapping); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; } - return 0; } -- GitLab From b71f889c18ada210a97aa3eb5e00c0de552234c6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:12 +1100 Subject: [PATCH 0842/1423] xfs: use byte ranges for write cleanup ranges xfs_buffered_write_iomap_end() currently converts the byte ranges passed to it to filesystem blocks to pass them to the bmap code to punch out delalloc blocks, but then has to convert filesytem blocks back to byte ranges for page cache truncate. We're about to make the page cache truncate go away and replace it with a page cache walk, so having to convert everything to/from/to filesystem blocks is messy and error-prone. It is much easier to pass around byte ranges and convert to page indexes and/or filesystem blocks only where those units are needed. In preparation for the page cache walk being added, add a helper that converts byte ranges to filesystem blocks and calls xfs_bmap_punch_delalloc_range() and convert xfs_buffered_write_iomap_end() to calculate limits in byte ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a2e45ea1b0cb3..7bb55dbc19d35 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1120,6 +1120,20 @@ out_unlock: return error; } +static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t start_byte, + loff_t end_byte) +{ + struct xfs_mount *mp = XFS_M(inode->i_sb); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + + return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, + end_fsb - start_fsb); +} + static int xfs_buffered_write_iomap_end( struct inode *inode, @@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; + struct xfs_mount *mp = XFS_M(inode->i_sb); + loff_t start_byte; + loff_t end_byte; int error = 0; if (iomap->type != IOMAP_DELALLOC) @@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end( * the range. */ if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); + start_byte = round_down(offset, mp->m_sb.sb_blocksize); else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); + start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); + end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); /* Nothing to do if we've written the entire delalloc extent */ - if (start_fsb >= end_fsb) + if (start_byte >= end_byte) return 0; /* @@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end( * leave dirty pages with no space reservation in the cache. */ filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); filemap_invalidate_unlock(inode->i_mapping); if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); return error; } return 0; -- GitLab From 9c7babf94a0d686b552e53aded8d4703d1b8b92b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:44:38 +1100 Subject: [PATCH 0843/1423] xfs,iomap: move delalloc punching to iomap Because that's what Christoph wants for this error handling path only XFS uses. It requires a new iomap export for handling errors over delalloc ranges. This is basically the XFS code as is stands, but even though Christoph wants this as iomap funcitonality, we still have to call it from the filesystem specific ->iomap_end callback, and call into the iomap code with yet another filesystem specific callback to punch the delalloc extent within the defined ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 60 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.c | 47 ++++++--------------------------- include/linux/iomap.h | 4 +++ 3 files changed, 72 insertions(+), 39 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 91ee0b308e13d..734b761a1e4ab 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,66 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from it's ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc + * extents from the range that are not dirty in the page cache. As the write can + * race with page faults, there can be dirty pages over the delalloc extent + * outside the range of a short write but still within the delalloc extent + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations, but converts them back to {offset,len} tuples for + * the punch callback. + */ +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, + ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); + int error = 0; + + if (iomap->type != IOMAP_DELALLOC) + return 0; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * start_byte refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = punch(inode, start_byte, end_byte - start_byte); + filemap_invalidate_unlock(inode->i_mapping); + + return error; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7bb55dbc19d35..ea96e8a348687 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1123,12 +1123,12 @@ out_unlock: static int xfs_buffered_write_delalloc_punch( struct inode *inode, - loff_t start_byte, - loff_t end_byte) + loff_t offset, + loff_t length) { struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, end_fsb - start_fsb); @@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - loff_t start_byte; - loff_t end_byte; - int error = 0; - if (iomap->type != IOMAP_DELALLOC) - return 0; + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; /* * Behave as if the write failed if drop writes is enabled. Set the NEW @@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end( written = 0; } - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return 0; - - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_byte = round_down(offset, mp->m_sb.sb_blocksize); - else - start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); - end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return 0; - - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); - filemap_invalidate_unlock(inode->i_mapping); + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino 0x%llx", __func__, XFS_I(inode)->i_ino); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 238a03087e17e..0698c4b8ce0e2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -226,6 +226,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)); + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -- GitLab From 9124a26401483bf2b13a99cb4317dce3f677060f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Sep 2022 01:58:59 -0700 Subject: [PATCH 0844/1423] kunit/fortify: Validate __alloc_size attribute results Validate the effect of the __alloc_size attribute on allocators. If the compiler doesn't support __builtin_dynamic_object_size(), skip the associated tests. (For GCC, just remove the "--make_options" line below...) $ ./tools/testing/kunit/kunit.py run --arch x86_64 \ --kconfig_add CONFIG_FORTIFY_SOURCE=y \ --make_options LLVM=1 fortify ... [15:16:30] ================== fortify (10 subtests) =================== [15:16:30] [PASSED] known_sizes_test [15:16:30] [PASSED] control_flow_split_test [15:16:30] [PASSED] alloc_size_kmalloc_const_test [15:16:30] [PASSED] alloc_size_kmalloc_dynamic_test [15:16:30] [PASSED] alloc_size_vmalloc_const_test [15:16:30] [PASSED] alloc_size_vmalloc_dynamic_test [15:16:30] [PASSED] alloc_size_kvmalloc_const_test [15:16:30] [PASSED] alloc_size_kvmalloc_dynamic_test [15:16:30] [PASSED] alloc_size_devm_kmalloc_const_test [15:16:30] [PASSED] alloc_size_devm_kmalloc_dynamic_test [15:16:30] ===================== [PASSED] fortify ===================== [15:16:30] ============================================================ [15:16:30] Testing complete. Ran 10 tests: passed: 10 [15:16:31] Elapsed time: 8.348s total, 0.002s configuring, 6.923s building, 1.075s running For earlier GCC prior to version 12, the dynamic tests will be skipped: [15:18:59] ================== fortify (10 subtests) =================== [15:18:59] [PASSED] known_sizes_test [15:18:59] [PASSED] control_flow_split_test [15:18:59] [PASSED] alloc_size_kmalloc_const_test [15:18:59] [SKIPPED] alloc_size_kmalloc_dynamic_test [15:18:59] [PASSED] alloc_size_vmalloc_const_test [15:18:59] [SKIPPED] alloc_size_vmalloc_dynamic_test [15:18:59] [PASSED] alloc_size_kvmalloc_const_test [15:18:59] [SKIPPED] alloc_size_kvmalloc_dynamic_test [15:18:59] [PASSED] alloc_size_devm_kmalloc_const_test [15:18:59] [SKIPPED] alloc_size_devm_kmalloc_dynamic_test [15:18:59] ===================== [PASSED] fortify ===================== [15:18:59] ============================================================ [15:18:59] Testing complete. Ran 10 tests: passed: 6, skipped: 4 [15:18:59] Elapsed time: 11.965s total, 0.002s configuring, 10.540s building, 1.068s running Cc: David Gow Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- lib/Makefile | 1 + lib/fortify_kunit.c | 255 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+) diff --git a/lib/Makefile b/lib/Makefile index 322178b9f7fbc..2f0454b931dc8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -378,6 +378,7 @@ CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-o obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o obj-$(CONFIG_STRSCPY_KUNIT_TEST) += strscpy_kunit.o obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index 409af07f340af..c8c33cbaae9ec 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -16,7 +16,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include +#include #include +#include static const char array_of_10[] = "this is 10"; static const char *ptr_of_11 = "this is 11!"; @@ -60,9 +63,261 @@ static void control_flow_split_test(struct kunit *test) KUNIT_EXPECT_EQ(test, want_minus_one(pick), SIZE_MAX); } +#define KUNIT_EXPECT_BOS(test, p, expected, name) \ + KUNIT_EXPECT_EQ_MSG(test, __builtin_object_size(p, 1), \ + expected, \ + "__alloc_size() not working with __bos on " name "\n") + +#if !__has_builtin(__builtin_dynamic_object_size) +#define KUNIT_EXPECT_BDOS(test, p, expected, name) \ + /* Silence "unused variable 'expected'" warning. */ \ + KUNIT_EXPECT_EQ(test, expected, expected) +#else +#define KUNIT_EXPECT_BDOS(test, p, expected, name) \ + KUNIT_EXPECT_EQ_MSG(test, __builtin_dynamic_object_size(p, 1), \ + expected, \ + "__alloc_size() not working with __bdos on " name "\n") +#endif + +/* If the execpted size is a constant value, __bos can see it. */ +#define check_const(_expected, alloc, free) do { \ + size_t expected = (_expected); \ + void *p = alloc; \ + KUNIT_EXPECT_TRUE_MSG(test, p != NULL, #alloc " failed?!\n"); \ + KUNIT_EXPECT_BOS(test, p, expected, #alloc); \ + KUNIT_EXPECT_BDOS(test, p, expected, #alloc); \ + free; \ +} while (0) + +/* If the execpted size is NOT a constant value, __bos CANNOT see it. */ +#define check_dynamic(_expected, alloc, free) do { \ + size_t expected = (_expected); \ + void *p = alloc; \ + KUNIT_EXPECT_TRUE_MSG(test, p != NULL, #alloc " failed?!\n"); \ + KUNIT_EXPECT_BOS(test, p, SIZE_MAX, #alloc); \ + KUNIT_EXPECT_BDOS(test, p, expected, #alloc); \ + free; \ +} while (0) + +/* Assortment of constant-value kinda-edge cases. */ +#define CONST_TEST_BODY(TEST_alloc) do { \ + /* Special-case vmalloc()-family to skip 0-sized allocs. */ \ + if (strcmp(#TEST_alloc, "TEST_vmalloc") != 0) \ + TEST_alloc(check_const, 0, 0); \ + TEST_alloc(check_const, 1, 1); \ + TEST_alloc(check_const, 128, 128); \ + TEST_alloc(check_const, 1023, 1023); \ + TEST_alloc(check_const, 1025, 1025); \ + TEST_alloc(check_const, 4096, 4096); \ + TEST_alloc(check_const, 4097, 4097); \ +} while (0) + +static volatile size_t zero_size; +static volatile size_t unknown_size = 50; + +#if !__has_builtin(__builtin_dynamic_object_size) +#define DYNAMIC_TEST_BODY(TEST_alloc) \ + kunit_skip(test, "Compiler is missing __builtin_dynamic_object_size() support\n") +#else +#define DYNAMIC_TEST_BODY(TEST_alloc) do { \ + size_t size = unknown_size; \ + \ + /* \ + * Expected size is "size" in each test, before it is then \ + * internally incremented in each test. Requires we disable \ + * -Wunsequenced. \ + */ \ + TEST_alloc(check_dynamic, size, size++); \ + /* Make sure incrementing actually happened. */ \ + KUNIT_EXPECT_NE(test, size, unknown_size); \ +} while (0) +#endif + +#define DEFINE_ALLOC_SIZE_TEST_PAIR(allocator) \ +static void alloc_size_##allocator##_const_test(struct kunit *test) \ +{ \ + CONST_TEST_BODY(TEST_##allocator); \ +} \ +static void alloc_size_##allocator##_dynamic_test(struct kunit *test) \ +{ \ + DYNAMIC_TEST_BODY(TEST_##allocator); \ +} + +#define TEST_kmalloc(checker, expected_size, alloc_size) do { \ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; \ + void *orig; \ + size_t len; \ + \ + checker(expected_size, kmalloc(alloc_size, gfp), \ + kfree(p)); \ + checker(expected_size, \ + kmalloc_node(alloc_size, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, kzalloc(alloc_size, gfp), \ + kfree(p)); \ + checker(expected_size, \ + kzalloc_node(alloc_size, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, kcalloc(1, alloc_size, gfp), \ + kfree(p)); \ + checker(expected_size, kcalloc(alloc_size, 1, gfp), \ + kfree(p)); \ + checker(expected_size, \ + kcalloc_node(1, alloc_size, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, \ + kcalloc_node(alloc_size, 1, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, kmalloc_array(1, alloc_size, gfp), \ + kfree(p)); \ + checker(expected_size, kmalloc_array(alloc_size, 1, gfp), \ + kfree(p)); \ + checker(expected_size, \ + kmalloc_array_node(1, alloc_size, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, \ + kmalloc_array_node(alloc_size, 1, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + checker(expected_size, __kmalloc(alloc_size, gfp), \ + kfree(p)); \ + checker(expected_size, \ + __kmalloc_node(alloc_size, gfp, NUMA_NO_NODE), \ + kfree(p)); \ + \ + orig = kmalloc(alloc_size, gfp); \ + KUNIT_EXPECT_TRUE(test, orig != NULL); \ + checker((expected_size) * 2, \ + krealloc(orig, (alloc_size) * 2, gfp), \ + kfree(p)); \ + orig = kmalloc(alloc_size, gfp); \ + KUNIT_EXPECT_TRUE(test, orig != NULL); \ + checker((expected_size) * 2, \ + krealloc_array(orig, 1, (alloc_size) * 2, gfp), \ + kfree(p)); \ + orig = kmalloc(alloc_size, gfp); \ + KUNIT_EXPECT_TRUE(test, orig != NULL); \ + checker((expected_size) * 2, \ + krealloc_array(orig, (alloc_size) * 2, 1, gfp), \ + kfree(p)); \ + \ + len = 11; \ + /* Using memdup() with fixed size, so force unknown length. */ \ + if (!__builtin_constant_p(expected_size)) \ + len += zero_size; \ + checker(len, kmemdup("hello there", len, gfp), kfree(p)); \ +} while (0) +DEFINE_ALLOC_SIZE_TEST_PAIR(kmalloc) + +/* Sizes are in pages, not bytes. */ +#define TEST_vmalloc(checker, expected_pages, alloc_pages) do { \ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; \ + checker((expected_pages) * PAGE_SIZE, \ + vmalloc((alloc_pages) * PAGE_SIZE), vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + vzalloc((alloc_pages) * PAGE_SIZE), vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + __vmalloc((alloc_pages) * PAGE_SIZE, gfp), vfree(p)); \ +} while (0) +DEFINE_ALLOC_SIZE_TEST_PAIR(vmalloc) + +/* Sizes are in pages (and open-coded for side-effects), not bytes. */ +#define TEST_kvmalloc(checker, expected_pages, alloc_pages) do { \ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; \ + size_t prev_size; \ + void *orig; \ + \ + checker((expected_pages) * PAGE_SIZE, \ + kvmalloc((alloc_pages) * PAGE_SIZE, gfp), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvmalloc_node((alloc_pages) * PAGE_SIZE, gfp, NUMA_NO_NODE), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvzalloc((alloc_pages) * PAGE_SIZE, gfp), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvzalloc_node((alloc_pages) * PAGE_SIZE, gfp, NUMA_NO_NODE), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvcalloc(1, (alloc_pages) * PAGE_SIZE, gfp), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvcalloc((alloc_pages) * PAGE_SIZE, 1, gfp), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvmalloc_array(1, (alloc_pages) * PAGE_SIZE, gfp), \ + vfree(p)); \ + checker((expected_pages) * PAGE_SIZE, \ + kvmalloc_array((alloc_pages) * PAGE_SIZE, 1, gfp), \ + vfree(p)); \ + \ + prev_size = (expected_pages) * PAGE_SIZE; \ + orig = kvmalloc(prev_size, gfp); \ + KUNIT_EXPECT_TRUE(test, orig != NULL); \ + checker(((expected_pages) * PAGE_SIZE) * 2, \ + kvrealloc(orig, prev_size, \ + ((alloc_pages) * PAGE_SIZE) * 2, gfp), \ + kvfree(p)); \ +} while (0) +DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc) + +#define TEST_devm_kmalloc(checker, expected_size, alloc_size) do { \ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; \ + const char dev_name[] = "fortify-test"; \ + struct device *dev; \ + void *orig; \ + size_t len; \ + \ + /* Create dummy device for devm_kmalloc()-family tests. */ \ + dev = root_device_register(dev_name); \ + KUNIT_ASSERT_FALSE_MSG(test, IS_ERR(dev), \ + "Cannot register test device\n"); \ + \ + checker(expected_size, devm_kmalloc(dev, alloc_size, gfp), \ + devm_kfree(dev, p)); \ + checker(expected_size, devm_kzalloc(dev, alloc_size, gfp), \ + devm_kfree(dev, p)); \ + checker(expected_size, \ + devm_kmalloc_array(dev, 1, alloc_size, gfp), \ + devm_kfree(dev, p)); \ + checker(expected_size, \ + devm_kmalloc_array(dev, alloc_size, 1, gfp), \ + devm_kfree(dev, p)); \ + checker(expected_size, \ + devm_kcalloc(dev, 1, alloc_size, gfp), \ + devm_kfree(dev, p)); \ + checker(expected_size, \ + devm_kcalloc(dev, alloc_size, 1, gfp), \ + devm_kfree(dev, p)); \ + \ + orig = devm_kmalloc(dev, alloc_size, gfp); \ + KUNIT_EXPECT_TRUE(test, orig != NULL); \ + checker((expected_size) * 2, \ + devm_krealloc(dev, orig, (alloc_size) * 2, gfp), \ + devm_kfree(dev, p)); \ + \ + len = 4; \ + /* Using memdup() with fixed size, so force unknown length. */ \ + if (!__builtin_constant_p(expected_size)) \ + len += zero_size; \ + checker(len, devm_kmemdup(dev, "Ohai", len, gfp), \ + devm_kfree(dev, p)); \ + \ + device_unregister(dev); \ +} while (0) +DEFINE_ALLOC_SIZE_TEST_PAIR(devm_kmalloc) + static struct kunit_case fortify_test_cases[] = { KUNIT_CASE(known_sizes_test), KUNIT_CASE(control_flow_split_test), + KUNIT_CASE(alloc_size_kmalloc_const_test), + KUNIT_CASE(alloc_size_kmalloc_dynamic_test), + KUNIT_CASE(alloc_size_vmalloc_const_test), + KUNIT_CASE(alloc_size_vmalloc_dynamic_test), + KUNIT_CASE(alloc_size_kvmalloc_const_test), + KUNIT_CASE(alloc_size_kvmalloc_dynamic_test), + KUNIT_CASE(alloc_size_devm_kmalloc_const_test), + KUNIT_CASE(alloc_size_devm_kmalloc_dynamic_test), {} }; -- GitLab From fb491d5500a7ca551e49bc32d9b19d226023f68d Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:27 +0100 Subject: [PATCH 0845/1423] KVM: s390: pv: asynchronous destroy for reboot Until now, destroying a protected guest was an entirely synchronous operation that could potentially take a very long time, depending on the size of the guest, due to the time needed to clean up the address space from protected pages. This patch implements an asynchronous destroy mechanism, that allows a protected guest to reboot significantly faster than previously. This is achieved by clearing the pages of the old guest in background. In case of reboot, the new guest will be able to run in the same address space almost immediately. The old protected guest is then only destroyed when all of its memory has been destroyed or otherwise made non protected. Two new PV commands are added for the KVM_S390_PV_COMMAND ioctl: KVM_PV_ASYNC_CLEANUP_PREPARE: set aside the current protected VM for later asynchronous teardown. The current KVM VM will then continue immediately as non-protected. If a protected VM had already been set aside for asynchronous teardown, but without starting the teardown process, this call will fail. There can be at most one VM set aside at any time. Once it is set aside, the protected VM only exists in the context of the Ultravisor, it is not associated with the KVM VM anymore. Its protected CPUs have already been destroyed, but not its memory. This command can be issued again immediately after starting KVM_PV_ASYNC_CLEANUP_PERFORM, without having to wait for completion. KVM_PV_ASYNC_CLEANUP_PERFORM: tears down the protected VM previously set aside using KVM_PV_ASYNC_CLEANUP_PREPARE. Ideally the KVM_PV_ASYNC_CLEANUP_PERFORM PV command should be issued by userspace from a separate thread. If a fatal signal is received (or if the process terminates naturally), the command will terminate immediately without completing. All protected VMs whose teardown was interrupted will be put in the need_cleanup list. The rest of the normal KVM teardown process will take care of properly cleaning up all remaining protected VMs, including the ones on the need_cleanup list. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Janosch Frank Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20221111170632.77622-2-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-2-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 2 + arch/s390/kvm/kvm-s390.c | 49 ++++- arch/s390/kvm/kvm-s390.h | 3 + arch/s390/kvm/pv.c | 295 +++++++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 2 + 5 files changed, 333 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 21f1339a41974..d67ce719d16a9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -942,6 +942,8 @@ struct kvm_s390_pv { unsigned long stor_base; void *stor_var; bool dumping; + void *set_aside; + struct list_head need_cleanup; struct mmu_notifier mmu_notifier; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bd6e0201bfe5f..f0abaaf7eea4c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -209,6 +209,8 @@ unsigned int diag9c_forwarding_hz; module_param(diag9c_forwarding_hz, uint, 0644); MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); +static int async_destroy; + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -2504,9 +2506,13 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; int r = 0; u16 dummy; - void __user *argp = (void __user *)cmd->data; + + if (need_lock) + mutex_lock(&kvm->lock); switch (cmd->cmd) { case KVM_PV_ENABLE: { @@ -2540,6 +2546,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); break; } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; case KVM_PV_DISABLE: { r = -EINVAL; if (!kvm_s390_pv_is_protected(kvm)) @@ -2553,7 +2584,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) */ if (r) break; - r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); /* no need to block service interrupts any more */ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); @@ -2703,6 +2734,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) default: r = -ENOTTY; } + if (need_lock) + mutex_unlock(&kvm->lock); + return r; } @@ -2907,9 +2941,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EINVAL; break; } - mutex_lock(&kvm->lock); + /* must be called without kvm->lock */ r = kvm_s390_handle_pv(kvm, &args); - mutex_unlock(&kvm->lock); if (copy_to_user(argp, &args, sizeof(args))) { r = -EFAULT; break; @@ -3228,6 +3261,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_vsie_init(kvm); if (use_gisa) kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -3272,11 +3307,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* * We are already at the end of life and kvm->lock is not taken. * This is ok as the file descriptor is closed by now and nobody - * can mess with the pv state. To avoid lockdep_assert_held from - * complaining we do not use kvm_s390_pv_is_protected. + * can mess with the pv state. */ - if (kvm_s390_pv_get_handle(kvm)) - kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); /* * Remove the mmu notifier only when the whole KVM VM is torn down, * and only if one was registered to begin with. If the VM is diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index a60d1e5c44cd1..826754937ae4a 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -244,6 +244,9 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 48c4f57d5d76f..5f958fcf62836 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -18,6 +18,29 @@ #include #include "kvm-s390.h" +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + static void kvm_s390_clear_pv_state(struct kvm *kvm) { kvm->arch.pv.handle = 0; @@ -161,23 +184,150 @@ out_err: return -ENOMEM; } -/* this should not fail, but if it does, we must not free the donated memory */ -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) { int cc; - cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), - UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; /* - * if the mm still has a mapping, make all its pages accessible - * before destroying the guest + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. */ - if (mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); - mmput(kvm->mm); + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. + */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) { + kfree(priv); + return -ENOMEM; } + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -191,6 +341,131 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return cc ? -EIO : 0; } +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */ + atomic_inc(&kvm->mm->context.protected_count); + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, struct mm_struct *mm) { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139ae..b3701b23ca189 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1740,6 +1740,8 @@ enum pv_cmd_id { KVM_PV_UNSHARE_ALL, KVM_PV_INFO, KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, }; struct kvm_pv_cmd { -- GitLab From d9459922a15ce7a20a85b38a976494ac7f445732 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:28 +0100 Subject: [PATCH 0846/1423] KVM: s390: pv: api documentation for asynchronous destroy Add documentation for the new commands added to the KVM_S390_PV_COMMAND ioctl. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20221111170632.77622-3-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-3-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 41 ++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eee9f857a986f..9175d41e80816 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5163,10 +5163,13 @@ KVM_PV_ENABLE ===== ============================= KVM_PV_DISABLE - Deregister the VM from the Ultravisor and reclaim the memory that - had been donated to the Ultravisor, making it usable by the kernel - again. All registered VCPUs are converted back to non-protected - ones. + Deregister the VM from the Ultravisor and reclaim the memory that had + been donated to the Ultravisor, making it usable by the kernel again. + All registered VCPUs are converted back to non-protected ones. If a + previous protected VM had been prepared for asynchonous teardown with + KVM_PV_ASYNC_CLEANUP_PREPARE and not subsequently torn down with + KVM_PV_ASYNC_CLEANUP_PERFORM, it will be torn down in this call + together with the current protected VM. KVM_PV_VM_SET_SEC_PARMS Pass the image header from VM memory to the Ultravisor in @@ -5289,6 +5292,36 @@ KVM_PV_DUMP authentication tag all of which are needed to decrypt the dump at a later time. +KVM_PV_ASYNC_CLEANUP_PREPARE + :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE + + Prepare the current protected VM for asynchronous teardown. Most + resources used by the current protected VM will be set aside for a + subsequent asynchronous teardown. The current protected VM will then + resume execution immediately as non-protected. There can be at most + one protected VM prepared for asynchronous teardown at any time. If + a protected VM had already been prepared for teardown without + subsequently calling KVM_PV_ASYNC_CLEANUP_PERFORM, this call will + fail. In that case, the userspace process should issue a normal + KVM_PV_DISABLE. The resources set aside with this call will need to + be cleaned up with a subsequent call to KVM_PV_ASYNC_CLEANUP_PERFORM + or KVM_PV_DISABLE, otherwise they will be cleaned up when KVM + terminates. KVM_PV_ASYNC_CLEANUP_PREPARE can be called again as soon + as cleanup starts, i.e. before KVM_PV_ASYNC_CLEANUP_PERFORM finishes. + +KVM_PV_ASYNC_CLEANUP_PERFORM + :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE + + Tear down the protected VM previously prepared for teardown with + KVM_PV_ASYNC_CLEANUP_PREPARE. The resources that had been set aside + will be freed during the execution of this command. This PV command + should ideally be issued by userspace from a separate thread. If a + fatal signal is received (or the process terminates naturally), the + command will terminate immediately without completing, and the normal + KVM shutdown procedure will take care of cleaning up all remaining + protected VMs, including the ones whose teardown was interrupted by + process termination. + 4.126 KVM_XEN_HVM_SET_ATTR -------------------------- -- GitLab From 8c516b25d6e9c70e6d76627932b14b0ef03a82c4 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:29 +0100 Subject: [PATCH 0847/1423] KVM: s390: pv: add KVM_CAP_S390_PROTECTED_ASYNC_DISABLE Add KVM_CAP_S390_PROTECTED_ASYNC_DISABLE to signal that the KVM_PV_ASYNC_DISABLE and KVM_PV_ASYNC_DISABLE_PREPARE commands for the KVM_S390_PV_COMMAND ioctl are available. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20221111170632.77622-4-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-4-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f0abaaf7eea4c..b6cc7d2935c0c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b3701b23ca189..d3f86a2808580 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #ifdef KVM_CAP_IRQ_ROUTING -- GitLab From afe20eb8df9108e4be9bdec88a4f90d2de863ca2 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:30 +0100 Subject: [PATCH 0848/1423] KVM: s390: pv: avoid export before import if possible If the appropriate UV feature bit is set, there is no need to perform an export before import. The misc feature indicates, among other things, that importing a shared page from a different protected VM will automatically also transfer its ownership. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Janosch Frank Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20221111170632.77622-5-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-5-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kernel/uv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index f9810d2a267c6..9f18a4af9c131 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -255,6 +255,13 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, */ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) { + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications)) + return false; if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) return false; return atomic_read(&mm->context.protected_count) > 1; -- GitLab From f7866f582b1c9d80d1a3bd0953170185668c52ca Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:31 +0100 Subject: [PATCH 0849/1423] KVM: s390: pv: support for Destroy fast UVC Add support for the Destroy Secure Configuration Fast Ultravisor call, and take advantage of it for asynchronous destroy. When supported, the protected guest is destroyed immediately using the new UVC, leaving only the memory to be cleaned up asynchronously. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Janosch Frank Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20221111170632.77622-6-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-6-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/uv.h | 10 +++++++ arch/s390/kvm/pv.c | 61 +++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index be3ef9dd69726..28a9ad57b6f16 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -34,6 +34,7 @@ #define UVC_CMD_INIT_UV 0x000f #define UVC_CMD_CREATE_SEC_CONF 0x0100 #define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 #define UVC_CMD_CREATE_SEC_CPU 0x0120 #define UVC_CMD_DESTROY_SEC_CPU 0x0121 #define UVC_CMD_CONV_TO_SEC_STOR 0x0200 @@ -81,6 +82,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, BIT_UVC_CMD_DUMP_INIT = 24, BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, BIT_UVC_CMD_DUMP_CPU = 26, @@ -230,6 +232,14 @@ struct uv_cb_nodata { u64 reserved20[4]; } __packed __aligned(8); +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + /* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 5f958fcf62836..e032ebbf51b97 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -203,6 +203,9 @@ static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, { int cc; + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); @@ -217,6 +220,7 @@ static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); vfree(leftover->stor_var); +done_fast: atomic_dec(&kvm->mm->context.protected_count); return 0; } @@ -250,6 +254,36 @@ static void kvm_s390_destroy_lower_2g(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Inteded memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + /** * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. * @kvm: the VM @@ -271,6 +305,7 @@ static void kvm_s390_destroy_lower_2g(struct kvm *kvm) int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) { struct pv_vm_to_be_destroyed *priv; + int res = 0; lockdep_assert_held(&kvm->lock); /* @@ -283,14 +318,21 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) if (!priv) return -ENOMEM; - priv->stor_var = kvm->arch.pv.stor_var; - priv->stor_base = kvm->arch.pv.stor_base; - priv->handle = kvm_s390_pv_get_handle(kvm); - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); - if (s390_replace_asce(kvm->arch.gmap)) { + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { kfree(priv); - return -ENOMEM; + return res; } kvm_s390_destroy_lower_2g(kvm); @@ -471,6 +513,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, { struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); u16 dummy; + int r; /* * No locking is needed since this is the last thread of the last user of this @@ -479,7 +522,9 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, * unregistered. This means that if this notifier runs, then the * struct kvm is still valid. */ - kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { -- GitLab From cc726886079febfa2384d77486d7f3a11f951ea9 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:32 +0100 Subject: [PATCH 0850/1423] KVM: s390: pv: module parameter to fence asynchronous destroy Add the module parameter "async_destroy", to allow the asynchronous destroy mechanism to be switched off. This might be useful for debugging purposes. The parameter is enabled by default since the feature is opt-in anyway. Signed-off-by: Claudio Imbrenda Reviewed-by: Janosch Frank Reviewed-by: Steffen Eiden Reviewed-by: Nico Boehr Link: https://lore.kernel.org/r/20221111170632.77622-7-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-7-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b6cc7d2935c0c..8a0c884bf7375 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -209,7 +209,13 @@ unsigned int diag9c_forwarding_hz; module_param(diag9c_forwarding_hz, uint, 0644); MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); -static int async_destroy; +/* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); /* * For now we handle at most 16 double words as this is what the s390 base -- GitLab From dbec280045f8ca568de2a88ac4712a35f82f2cb1 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Fri, 18 Nov 2022 11:04:29 +0100 Subject: [PATCH 0851/1423] s390/vfio-ap: GISA: sort out physical vs virtual pointers usage Fix virtual vs physical address confusion (which currently are the same) for the GISA when enabling the IRQ. Signed-off-by: Nico Boehr Reviewed-by: Halil Pasic Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20221118100429.70453-1-nrb@linux.ibm.com Message-Id: <20221118100429.70453-1-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- drivers/s390/crypto/vfio_ap_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae6..205a001058589 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -429,7 +429,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, aqic_gisa.isc = nisc; aqic_gisa.ir = 1; - aqic_gisa.gisa = (uint64_t)gisa >> 4; + aqic_gisa.gisa = virt_to_phys(gisa) >> 4; status = ap_aqic(q->apqn, aqic_gisa, h_nib); switch (status.response_code) { -- GitLab From 99b63f55dc514a357c2ecf25e9aab149879329f0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 18 Nov 2022 16:11:33 +0100 Subject: [PATCH 0852/1423] KVM: s390: remove unused gisa_clear_ipm_gisc() function clang warns about an unused function: arch/s390/kvm/interrupt.c:317:20: error: unused function 'gisa_clear_ipm_gisc' [-Werror,-Wunused-function] static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) Remove gisa_clear_ipm_gisc(), since it is unused and get rid of this warning. Signed-off-by: Heiko Carstens Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20221118151133.2974602-1-hca@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Janosch Frank --- arch/s390/kvm/interrupt.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab569faf0df24..1dae78deddf28 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -314,11 +314,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); -- GitLab From b8a83e600bdde93e7da41ea3204b2b3832a3c99b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:42 +0300 Subject: [PATCH 0853/1423] dt-bindings: imx6q-pcie: Fix clock names for imx6sx and imx8mq Originally as it was defined the legacy bindings the pcie_inbound_axi and pcie_aux clock names were supposed to be used in the fsl,imx6sx-pcie and fsl,imx8mq-pcie devices respectively. But the bindings conversion has been incorrectly so now the fourth clock name is defined as "pcie_inbound_axi for imx6sx-pcie, pcie_aux for imx8mq-pcie", which is completely wrong. Let's fix that by conditionally apply the clock-names constraints based on the compatible string content. Link: https://lore.kernel.org/r/20221113191301.5526-2-Sergey.Semin@baikalelectronics.ru Fixes: 751ca492f131 ("dt-bindings: PCI: imx6: convert the imx pcie controller to dtschema") Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Alexander Stein --- .../bindings/pci/fsl,imx6q-pcie.yaml | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml index 376e739bcad40..49b4f7a32e71e 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml @@ -14,9 +14,6 @@ description: |+ This PCIe host controller is based on the Synopsys DesignWare PCIe IP and thus inherits all the common properties defined in snps,dw-pcie.yaml. -allOf: - - $ref: /schemas/pci/snps,dw-pcie.yaml# - properties: compatible: enum: @@ -61,7 +58,7 @@ properties: - const: pcie - const: pcie_bus - const: pcie_phy - - const: pcie_inbound_axi for imx6sx-pcie, pcie_aux for imx8mq-pcie + - enum: [ pcie_inbound_axi, pcie_aux ] num-lanes: const: 1 @@ -175,6 +172,47 @@ required: - clocks - clock-names +allOf: + - $ref: /schemas/pci/snps,dw-pcie.yaml# + - if: + properties: + compatible: + contains: + const: fsl,imx6sx-pcie + then: + properties: + clock-names: + items: + - {} + - {} + - {} + - const: pcie_inbound_axi + - if: + properties: + compatible: + contains: + const: fsl,imx8mq-pcie + then: + properties: + clock-names: + items: + - {} + - {} + - {} + - const: pcie_aux + - if: + properties: + compatible: + not: + contains: + enum: + - fsl,imx6sx-pcie + - fsl,imx8mq-pcie + then: + properties: + clock-names: + maxItems: 3 + unevaluatedProperties: false examples: -- GitLab From 4cf4b9b70ab2785461190c08a3542d2d74c28b46 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:43 +0300 Subject: [PATCH 0854/1423] dt-bindings: visconti-pcie: Fix interrupts array max constraints In accordance with the way the device DT-node is actually defined in arch/arm64/boot/dts/toshiba/tmpv7708.dtsi and the way the device is probed by the DW PCIe driver there are two IRQs it actually has. It's MSI IRQ the DT-bindings lack. Let's extend the interrupts property constraints then and fix the schema example so one would be acceptable by the actual device DT-bindings. Link: https://lore.kernel.org/r/20221113191301.5526-3-Sergey.Semin@baikalelectronics.ru Fixes: 17c1b16340f0 ("dt-bindings: pci: Add DT binding for Toshiba Visconti PCIe controller") Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring Acked-by: Nobuhiro Iwamatsu --- .../devicetree/bindings/pci/toshiba,visconti-pcie.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml b/Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml index 48ed227fc5b9e..53da2edd7c9ab 100644 --- a/Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml @@ -36,7 +36,7 @@ properties: - const: mpu interrupts: - maxItems: 1 + maxItems: 2 clocks: items: @@ -94,8 +94,9 @@ examples: #interrupt-cells = <1>; ranges = <0x81000000 0 0x40000000 0 0x40000000 0 0x00010000>, <0x82000000 0 0x50000000 0 0x50000000 0 0x20000000>; - interrupts = ; - interrupt-names = "intr"; + interrupts = , + ; + interrupt-names = "msi", "intr"; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &gic GIC_SPI 215 IRQ_TYPE_LEVEL_HIGH -- GitLab From 057646a5db2f8873efba90eeffd165c2525b413f Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:44 +0300 Subject: [PATCH 0855/1423] dt-bindings: PCI: dwc: Detach common RP/EP DT bindings Currently both DW PCIe Root Port and End-point DT bindings are defined as separate schemas. Carefully looking at them, at the hardware reference manuals and seeing there is a generic part of the driver used by the both RP and EP drivers we can greatly simplify the DW PCIe controller bindings by moving some of the properties into the common DT schema. It concerns the PERST GPIO control, number of lanes, number of iATU windows and CDM check properties. They will be defined in the snps,dw-pcie-common.yaml schema which will be referenced in the DW PCIe Root Port and End-point DT bindings in order to evaluate the common for both of these controllers properties. The rest of properties like reg{,-names}, clock{s,-names}, reset{s,-names}, etc will be consolidate there in one of the next commits. Link: https://lore.kernel.org/r/20221113191301.5526-4-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-common.yaml | 76 +++++++++++++++++++ .../bindings/pci/snps,dw-pcie-ep.yaml | 31 +------- .../devicetree/bindings/pci/snps,dw-pcie.yaml | 33 +------- 3 files changed, 78 insertions(+), 62 deletions(-) create mode 100644 Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml new file mode 100644 index 0000000000000..554c2804c608a --- /dev/null +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/snps,dw-pcie-common.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Synopsys DWC PCIe RP/EP controller + +maintainers: + - Jingoo Han + - Gustavo Pimentel + +description: + Generic Synopsys DesignWare PCIe Root Port and Endpoint controller + properties. + +select: false + +properties: + reset-gpio: + deprecated: true + description: + Reference to the GPIO-controlled PERST# signal. It is used to reset all + the peripheral devices available on the PCIe bus. + maxItems: 1 + + reset-gpios: + description: + Reference to the GPIO-controlled PERST# signal. It is used to reset all + the peripheral devices available on the PCIe bus. + maxItems: 1 + + num-lanes: + description: + Number of PCIe link lanes to use. Can be omitted if the already brought + up link is supposed to be preserved. + maximum: 16 + + num-ob-windows: + $ref: /schemas/types.yaml#/definitions/uint32 + deprecated: true + description: + Number of outbound address translation windows. This parameter can be + auto-detected based on the iATU memory writability. So there is no + point in having a dedicated DT-property for it. + maximum: 256 + + num-ib-windows: + $ref: /schemas/types.yaml#/definitions/uint32 + deprecated: true + description: + Number of inbound address translation windows. In the same way as + for the outbound AT windows, this parameter can be auto-detected based + on the iATU memory writability. There is no point having a dedicated + DT-property for it either. + maximum: 256 + + num-viewport: + $ref: /schemas/types.yaml#/definitions/uint32 + deprecated: true + description: + Number of outbound view ports configured in hardware. It's the same as + the number of outbound AT windows. + maximum: 256 + + snps,enable-cdm-check: + $ref: /schemas/types.yaml#/definitions/flag + description: + Enable automatic checking of CDM (Configuration Dependent Module) + registers for data corruption. CDM registers include standard PCIe + configuration space registers, Port Logic registers, DMA and iATU + registers. This feature has been available since DWC PCIe v4.80a. + +additionalProperties: true + +... diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index b78535040f04c..eae60901d60e2 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -15,6 +15,7 @@ description: | allOf: - $ref: /schemas/pci/pci-ep.yaml# + - $ref: /schemas/pci/snps,dw-pcie-common.yaml# properties: compatible: @@ -36,36 +37,6 @@ properties: items: enum: [dbi, dbi2, config, atu, addr_space, link, atu_dma, appl] - reset-gpio: - description: GPIO pin number of PERST# signal - maxItems: 1 - deprecated: true - - reset-gpios: - description: GPIO controlled connection to PERST# signal - maxItems: 1 - - snps,enable-cdm-check: - type: boolean - description: | - This is a boolean property and if present enables - automatic checking of CDM (Configuration Dependent Module) registers - for data corruption. CDM registers include standard PCIe configuration - space registers, Port Logic registers, DMA and iATU (internal Address - Translation Unit) registers. - - num-ib-windows: - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 256 - description: number of inbound address translation windows - deprecated: true - - num-ob-windows: - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 256 - description: number of outbound address translation windows - deprecated: true - required: - reg - reg-names diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index 7287d395e1b65..505b01e0a0341 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -15,6 +15,7 @@ description: | allOf: - $ref: /schemas/pci/pci-bus.yaml# + - $ref: /schemas/pci/snps,dw-pcie-common.yaml# properties: compatible: @@ -37,44 +38,12 @@ properties: enum: [ dbi, dbi2, config, atu, atu_dma, app, appl, elbi, mgmt, ctrl, parf, cfg, link, ulreg, smu, mpu, apb, phy ] - num-lanes: - description: | - number of lanes to use (this property should be specified unless - the link is brought already up in firmware) - maximum: 16 - - reset-gpio: - description: GPIO pin number of PERST# signal - maxItems: 1 - deprecated: true - - reset-gpios: - description: GPIO controlled connection to PERST# signal - maxItems: 1 - interrupts: true interrupt-names: true clocks: true - snps,enable-cdm-check: - type: boolean - description: | - This is a boolean property and if present enables - automatic checking of CDM (Configuration Dependent Module) registers - for data corruption. CDM registers include standard PCIe configuration - space registers, Port Logic registers, DMA and iATU (internal Address - Translation Unit) registers. - - num-viewport: - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 256 - description: | - number of view ports configured in hardware. If a platform - does not specify it, the driver autodetects it. - deprecated: true - additionalProperties: true required: -- GitLab From b9fe9985aee2cb62814671b883b9cbfa1c941ab3 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:45 +0300 Subject: [PATCH 0856/1423] dt-bindings: PCI: dwc: Remove bus node from the examples It's absolutely redundant seeing by default each node is embedded into its own example-X node with address and size cells set to 1. Link: https://lore.kernel.org/r/20221113191301.5526-5-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-ep.yaml | 16 ++++----- .../devicetree/bindings/pci/snps,dw-pcie.yaml | 35 ++++++++++--------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index eae60901d60e2..7d05dcba419bf 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -46,14 +46,10 @@ additionalProperties: true examples: - | - bus { - #address-cells = <1>; - #size-cells = <1>; - pcie-ep@dfd00000 { - compatible = "snps,dw-pcie-ep"; - reg = <0xdfc00000 0x0001000>, /* IP registers 1 */ - <0xdfc01000 0x0001000>, /* IP registers 2 */ - <0xd0000000 0x2000000>; /* Configuration space */ - reg-names = "dbi", "dbi2", "addr_space"; - }; + pcie-ep@dfd00000 { + compatible = "snps,dw-pcie-ep"; + reg = <0xdfc00000 0x0001000>, /* IP registers 1 */ + <0xdfc01000 0x0001000>, /* IP registers 2 */ + <0xd0000000 0x2000000>; /* Configuration space */ + reg-names = "dbi", "dbi2", "addr_space"; }; diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index 505b01e0a0341..3fdc80453a850 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -53,21 +53,22 @@ required: examples: - | - bus { - #address-cells = <1>; - #size-cells = <1>; - pcie@dfc00000 { - device_type = "pci"; - compatible = "snps,dw-pcie"; - reg = <0xdfc00000 0x0001000>, /* IP registers */ - <0xd0000000 0x0002000>; /* Configuration space */ - reg-names = "dbi", "config"; - #address-cells = <3>; - #size-cells = <2>; - ranges = <0x81000000 0 0x00000000 0xde000000 0 0x00010000>, - <0x82000000 0 0xd0400000 0xd0400000 0 0x0d000000>; - interrupts = <25>, <24>; - #interrupt-cells = <1>; - num-lanes = <1>; - }; + pcie@dfc00000 { + compatible = "snps,dw-pcie"; + device_type = "pci"; + reg = <0xdfc00000 0x0001000>, /* IP registers */ + <0xd0000000 0x0002000>; /* Configuration space */ + reg-names = "dbi", "config"; + #address-cells = <3>; + #size-cells = <2>; + ranges = <0x81000000 0 0x00000000 0xde000000 0 0x00010000>, + <0x82000000 0 0xd0400000 0xd0400000 0 0x0d000000>; + bus-range = <0x0 0xff>; + + interrupts = <25>, <24>; + #interrupt-cells = <1>; + + reset-gpios = <&port0 0 1>; + + num-lanes = <1>; }; -- GitLab From 875596361910711f3e7ba6314075d867e4b74fd1 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:46 +0300 Subject: [PATCH 0857/1423] dt-bindings: PCI: dwc: Add phys/phy-names common properties It's normal to have the DW PCIe RP/EP DT-nodes equipped with the explicit PHY phandle references. There can be up to 16 PHYs attach in accordance with the maximum number of supported PCIe lanes. Let's extend the common DW PCIe controller schema with the 'phys' and 'phy-names' properties definition. There two types PHY names are defined: preferred generic names '^pcie[0-9]+$' and non-preferred vendor-specific names '^pcie([0-9]+|-?phy[0-9]*)?$' so to match the names currently supported by the DW PCIe platform drivers ("pcie": meson; "pciephy": qcom, imx6; "pcie-phy": uniphier, rockchip, spear13xx; "pcie": intel-gw; "pcie-phy%d": keystone, dra7xx; "pcie": histb, etc). Link: https://lore.kernel.org/r/20221113191301.5526-6-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-common.yaml | 24 +++++++++++++++++++ .../bindings/pci/snps,dw-pcie-ep.yaml | 3 +++ .../devicetree/bindings/pci/snps,dw-pcie.yaml | 3 +++ 3 files changed, 30 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index 554c2804c608a..91d24e400dfcc 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -17,6 +17,30 @@ description: select: false properties: + phys: + description: + There can be up to the number of possible lanes PHYs specified placed in + the phandle array in the line-based order. Obviously each the specified + PHYs are supposed to be able to work in the PCIe mode with a speed + implied by the DWC PCIe controller they are attached to. + minItems: 1 + maxItems: 16 + + phy-names: + minItems: 1 + maxItems: 16 + oneOf: + - description: Generic PHY names + items: + pattern: '^pcie[0-9]+$' + - description: + Vendor-specific PHY names. Consider using the generic + names above for new bindings. + items: + oneOf: + - pattern: '^pcie(-?phy[0-9]*)?$' + - pattern: '^p2u-[0-7]$' + reset-gpio: deprecated: true description: diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index 7d05dcba419bf..dcd521aed2130 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -52,4 +52,7 @@ examples: <0xdfc01000 0x0001000>, /* IP registers 2 */ <0xd0000000 0x2000000>; /* Configuration space */ reg-names = "dbi", "dbi2", "addr_space"; + + phys = <&pcie_phy0>, <&pcie_phy1>, <&pcie_phy2>, <&pcie_phy3>; + phy-names = "pcie0", "pcie1", "pcie2", "pcie3"; }; diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index 3fdc80453a850..d9512f7f7124a 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -70,5 +70,8 @@ examples: reset-gpios = <&port0 0 1>; + phys = <&pcie_phy>; + phy-names = "pcie"; + num-lanes = <1>; }; -- GitLab From eaa9d886528730bcd7213f0b22c8dd468460f495 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:47 +0300 Subject: [PATCH 0858/1423] dt-bindings: PCI: dwc: Add max-link-speed common property In accordance with [1] DW PCIe controllers support up to Gen5 link speed. Let's add the max-link-speed property upper bound to 5 then. The DT bindings of the particular devices are expected to setup more strict constraint on that parameter. [1] Synopsys DesignWare Cores PCI Express Controller Databook, Version 5.40a, March 2019, p. 27 Link: https://lore.kernel.org/r/20221113191301.5526-7-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml | 3 +++ Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml | 2 ++ Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml | 1 + 3 files changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index 91d24e400dfcc..e63c21783fc1f 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -54,6 +54,9 @@ properties: the peripheral devices available on the PCIe bus. maxItems: 1 + max-link-speed: + maximum: 5 + num-lanes: description: Number of PCIe link lanes to use. Can be omitted if the already brought diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index dcd521aed2130..fc3b5d4ac2450 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -55,4 +55,6 @@ examples: phys = <&pcie_phy0>, <&pcie_phy1>, <&pcie_phy2>, <&pcie_phy3>; phy-names = "pcie0", "pcie1", "pcie2", "pcie3"; + + max-link-speed = <3>; }; diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index d9512f7f7124a..e787b97275897 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -74,4 +74,5 @@ examples: phy-names = "pcie"; num-lanes = <1>; + max-link-speed = <3>; }; -- GitLab From f133396e2d0063d589362122da659fe047643384 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:48 +0300 Subject: [PATCH 0859/1423] dt-bindings: PCI: dwc: Apply generic schema for generic device only Having the generic compatible strings constraints with the 'any'+'generic string' semantic implicitly encourages either to add new DW PCIe-based DT-bindings with the generic compatible string attached or just forget about adding new DT-bindings since the corresponding DT-node will be evaluated anyway. Moreover having that semantic implemented in the generic DT-schema causes the DT-validation tool to apply the schema twice: first by implicit compatible-string-based selection and second by means of the 'allOf: [ $ref ]' statement. Let's fix all of that by dropping the compatible property constraints and selecting the generic DT-schema only for the purely generic DW PCIe DT-nodes. The later is required since there is a driver for such devices. (Though there are no such DT-nodes currently defined in the kernel DT sources.) Link: https://lore.kernel.org/r/20221113191301.5526-8-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/snps,dw-pcie-ep.yaml | 16 ++++++++++------ .../devicetree/bindings/pci/snps,dw-pcie.yaml | 16 ++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index fc3b5d4ac2450..d04001248b53d 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -13,16 +13,20 @@ maintainers: description: | Synopsys DesignWare PCIe host controller endpoint +# Please create a separate DT-schema for your DWC PCIe Endpoint controller +# and make sure it's assigned with the vendor-specific compatible string. +select: + properties: + compatible: + const: snps,dw-pcie-ep + required: + - compatible + allOf: - $ref: /schemas/pci/pci-ep.yaml# - $ref: /schemas/pci/snps,dw-pcie-common.yaml# properties: - compatible: - anyOf: - - {} - - const: snps,dw-pcie-ep - reg: description: | It should contain Data Bus Interface (dbi) and config registers for all @@ -38,9 +42,9 @@ properties: enum: [dbi, dbi2, config, atu, addr_space, link, atu_dma, appl] required: + - compatible - reg - reg-names - - compatible additionalProperties: true diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index e787b97275897..85861b71d9ff7 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -13,16 +13,20 @@ maintainers: description: | Synopsys DesignWare PCIe host controller +# Please create a separate DT-schema for your DWC PCIe Root Port controller +# and make sure it's assigned with the vendor-specific compatible string. +select: + properties: + compatible: + const: snps,dw-pcie + required: + - compatible + allOf: - $ref: /schemas/pci/pci-bus.yaml# - $ref: /schemas/pci/snps,dw-pcie-common.yaml# properties: - compatible: - anyOf: - - {} - - const: snps,dw-pcie - reg: description: | It should contain Data Bus Interface (dbi) and config registers for all @@ -47,9 +51,9 @@ properties: additionalProperties: true required: + - compatible - reg - reg-names - - compatible examples: - | -- GitLab From 12f7936c7a0e0c40069ff12ddfd091a29da6e77c Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:49 +0300 Subject: [PATCH 0860/1423] dt-bindings: PCI: dwc: Add max-functions EP property In accordance with [1] the CX_NFUNC IP-core synthesize parameter is responsible for the number of physical functions to support in the EP mode. Its upper limit is 32. Let's use it to constrain the number of PCIe functions the DW PCIe EP DT-nodes can advertise. [1] Synopsys DesignWare Cores PCI Express Controller Databook - DWC PCIe Endpoint, Version 5.40a, March 2019, p. 887. Link: https://lore.kernel.org/r/20221113191301.5526-9-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index d04001248b53d..71dd19ae1060c 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -41,6 +41,9 @@ properties: items: enum: [dbi, dbi2, config, atu, addr_space, link, atu_dma, appl] + max-functions: + maximum: 32 + required: - compatible - reg @@ -61,4 +64,5 @@ examples: phy-names = "pcie0", "pcie1", "pcie2", "pcie3"; max-link-speed = <3>; + max-functions = /bits/ 8 <4>; }; -- GitLab From 35486813c41b3a5229b4987857ff597704feda21 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:50 +0300 Subject: [PATCH 0861/1423] dt-bindings: PCI: dwc: Add interrupts/interrupt-names common properties Currently the 'interrupts' and 'interrupt-names' properties are defined being too generic to really describe any actual IRQ interface. Moreover the DW PCIe End-point devices are left with no IRQ signals. All of that can be fixed by adding the IRQ-related properties to the common DW PCIe DT-schemas in accordance with the hardware reference manual. The DW PCIe common DT-schema will contain the generic properties definitions with just a number of entries per property, while the DW PCIe RP/EP-specific schemas will have the particular number of items and the generic resource names listed. Note since there are DW PCI-based vendor-specific DT-bindings with the custom names assigned to the same IRQ resources we have no much choice but to add them to the generic DT-schemas in order to have the schemas being applicable for such devices. These names are marked as vendor-specific and should be avoided being used in new bindings in favor of the generic names. Link: https://lore.kernel.org/r/20221113191301.5526-10-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-common.yaml | 19 ++++ .../bindings/pci/snps,dw-pcie-ep.yaml | 52 +++++++++++ .../devicetree/bindings/pci/snps,dw-pcie.yaml | 90 ++++++++++++++++++- 3 files changed, 158 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index e63c21783fc1f..4646fb14e8176 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -17,6 +17,25 @@ description: select: false properties: + interrupts: + description: + There are two main sub-blocks which are normally capable of + generating interrupts. It's System Information Interface and MSI + interface. While the former one has some common for the Host and + Endpoint controllers IRQ-signals, the later interface is obviously + Root Complex specific since it's responsible for the incoming MSI + messages signalling. The System Information IRQ signals are mainly + responsible for reporting the generic PCIe hierarchy and Root + Complex events like VPD IO request, general AER, PME, Hot-plug, link + bandwidth change, link equalization request, INTx asserted/deasserted + Message detection, embedded DMA Tx/Rx/Error. + minItems: 1 + maxItems: 26 + + interrupt-names: + minItems: 1 + maxItems: 26 + phys: description: There can be up to the number of possible lanes PHYs specified placed in diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index 71dd19ae1060c..7d3f8fc8b7b49 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -41,6 +41,55 @@ properties: items: enum: [dbi, dbi2, config, atu, addr_space, link, atu_dma, appl] + interrupts: + description: + There is no mandatory IRQ signals for the normal controller functioning, + but in addition to the native set the platforms may have a link- or + PM-related IRQs specified. + minItems: 1 + maxItems: 20 + + interrupt-names: + minItems: 1 + maxItems: 20 + items: + oneOf: + - description: + Controller request to read or write virtual product data + from/to the VPD capability registers. + const: vpd + - description: + Link Equalization Request flag is set in the Link Status 2 + register (applicable if the corresponding IRQ is enabled in + the Link Control 3 register). + const: l_eq + - description: + Indicates that the eDMA Tx/Rx transfer is complete or that an + error has occurred on the corresponding channel. eDMA can have + eight Tx (Write) and Rx (Read) eDMA channels thus supporting up + to 16 IRQ signals all together. Write eDMA channels shall go + first in the ordered row as per default edma_int[*] bus setup. + pattern: '^dma([0-9]|1[0-5])?$' + - description: + PCIe protocol correctable error or a Data Path protection + correctable error is detected by the automotive/safety + feature. + const: sft_ce + - description: + Indicates that the internal safety mechanism has detected an + uncorrectable error. + const: sft_ue + - description: + Application-specific IRQ raised depending on the vendor-specific + events basis. + const: app + - description: + Vendor-specific IRQ names. Consider using the generic names above + for new bindings. + oneOf: + - description: See native "app" IRQ for details + enum: [ intr ] + max-functions: maximum: 32 @@ -60,6 +109,9 @@ examples: <0xd0000000 0x2000000>; /* Configuration space */ reg-names = "dbi", "dbi2", "addr_space"; + interrupts = <23>, <24>; + interrupt-names = "dma0", "dma1"; + phys = <&pcie_phy0>, <&pcie_phy1>, <&pcie_phy2>, <&pcie_phy3>; phy-names = "pcie0", "pcie1", "pcie2", "pcie3"; diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index 85861b71d9ff7..fa1db57b2b979 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -42,9 +42,92 @@ properties: enum: [ dbi, dbi2, config, atu, atu_dma, app, appl, elbi, mgmt, ctrl, parf, cfg, link, ulreg, smu, mpu, apb, phy ] - interrupts: true - - interrupt-names: true + interrupts: + description: + DWC PCIe Root Port/Complex specific IRQ signals. At least MSI interrupt + signal is supposed to be specified for the host controller. + minItems: 1 + maxItems: 26 + + interrupt-names: + minItems: 1 + maxItems: 26 + items: + oneOf: + - description: + Controller request to read or write virtual product data + from/to the VPD capability registers. + const: vpd + - description: + Link Equalization Request flag is set in the Link Status 2 + register (applicable if the corresponding IRQ is enabled in + the Link Control 3 register). + const: l_eq + - description: + Indicates that the eDMA Tx/Rx transfer is complete or that an + error has occurred on the corresponding channel. eDMA can have + eight Tx (Write) and Rx (Read) eDMA channels thus supporting up + to 16 IRQ signals all together. Write eDMA channels shall go + first in the ordered row as per default edma_int[*] bus setup. + pattern: '^dma([0-9]|1[0-5])?$' + - description: + PCIe protocol correctable error or a Data Path protection + correctable error is detected by the automotive/safety + feature. + const: sft_ce + - description: + Indicates that the internal safety mechanism has detected an + uncorrectable error. + const: sft_ue + - description: + Application-specific IRQ raised depending on the vendor-specific + events basis. + const: app + - description: + DSP AXI MSI Interrupt detected. It gets de-asserted when there is + no more MSI interrupt pending. The interrupt is relevant to the + iMSI-RX - Integrated MSI Receiver (AXI bridge). + const: msi + - description: + Legacy A/B/C/D interrupt signal. Basically it's triggered by + receiving a Assert_INT{A,B,C,D}/Desassert_INT{A,B,C,D} message + from the downstream device. + pattern: "^int(a|b|c|d)$" + - description: + Error condition detected and a flag is set in the Root Error Status + register of the AER capability. It's asserted when the RC + internally generated an error or an error message is received by + the RC. + const: aer + - description: + PME message is received by the port. That means having the PME + status bit set in the Root Status register (the event is + supposed to be unmasked in the Root Control register). + const: pme + - description: + Hot-plug event is detected. That is a bit has been set in the + Slot Status register and the corresponding event is enabled in + the Slot Control register. + const: hp + - description: + Link Autonomous Bandwidth Status flag has been set in the Link + Status register (the event is supposed to be unmasked in the + Link Control register). + const: bw_au + - description: + Bandwidth Management Status flag has been set in the Link + Status register (the event is supposed to be unmasked in the + Link Control register). + const: bw_mg + - description: + Vendor-specific IRQ names. Consider using the generic names above + for new bindings. + oneOf: + - description: See native "app" IRQ for details + enum: [ intr ] + allOf: + - contains: + const: msi clocks: true @@ -70,6 +153,7 @@ examples: bus-range = <0x0 0xff>; interrupts = <25>, <24>; + interrupt-names = "msi", "hp"; #interrupt-cells = <1>; reset-gpios = <&port0 0 1>; -- GitLab From 4cc13eedb892c53f3d61fb5a1f6d57724541441a Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:51 +0300 Subject: [PATCH 0862/1423] dt-bindings: PCI: dwc: Add reg/reg-names common properties Even though there is a more-or-less limited set of the CSR spaces can be defined for each DW PCIe controller the generic DT-schema currently doesn't specify much limitations on the reg-space names used for one or another range. In order to prevent the vendor-specific controller schemas further deviation from the generic interface let's fix that by introducing the reg-names definition in the common DW PCIe DT-schemas and preserving the generic "reg" and "reg-names" properties in there. New DW PCIe device DT-bindings are encouraged to use the generic set of the CSR spaces defined in the generic DW PCIe RP/EP DT-bindings, while the already available vendor-specific DT-bindings can still apple the common DT-schemas. Note the number of reg/reg-names items need to be changed in the DW PCIe EP DT-schema since aside with the "dbi" CSRs space these arrays can have "dbi2", "addr_space", "atu", etc ranges. Also note since there are DW PCIe-based vendor-specific DT-bindings with the custom names assigned to the same CSR resources we have no much choice but to add them to the generic DT-schemas in order to have the schemas being applicable for such devices. These names are marked as vendor-specific and should be avoided being used in new bindings in favor of the generic names. Link: https://lore.kernel.org/r/20221113191301.5526-11-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-common.yaml | 22 +++++ .../bindings/pci/snps,dw-pcie-ep.yaml | 82 +++++++++++++++++-- .../devicetree/bindings/pci/snps,dw-pcie.yaml | 78 ++++++++++++++++-- 3 files changed, 169 insertions(+), 13 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index 4646fb14e8176..13c41cd50e54a 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -17,6 +17,28 @@ description: select: false properties: + reg: + description: + DWC PCIe CSR space is normally accessed over the dedicated Data Bus + Interface - DBI. In accordance with the reference manual the register + configuration space belongs to the Configuration-Dependent Module (CDM) + and is split up into several sub-parts Standard PCIe configuration + space, Port Logic Registers (PL), Shadow Config-space Registers, + iATU/eDMA registers. The particular sub-space is selected by the + CDM/ELBI (dbi_cs) and CS2 (dbi_cs2) signals (selector bits). Such + configuration provides a flexible interface for the system engineers to + either map the particular space at a desired MMIO address or just leave + them in a contiguous memory space if pure Native or AXI Bridge DBI access + is selected. Note the PCIe CFG-space, PL and Shadow registers are + specific for each activated function, while the rest of the sub-spaces + are common for all of them (if there are more than one). + minItems: 2 + maxItems: 6 + + reg-names: + minItems: 2 + maxItems: 6 + interrupts: description: There are two main sub-blocks which are normally capable of diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index 7d3f8fc8b7b49..f4d7eb2dec4db 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -28,18 +28,86 @@ allOf: properties: reg: - description: | - It should contain Data Bus Interface (dbi) and config registers for all - versions. - For designware core version >= 4.80, it may contain ATU address space. + description: + DBI, DBI2 reg-spaces and outbound memory window are required for the + normal controller functioning. iATU memory IO region is also required + if the space is unrolled (IP-core version >= 4.80a). minItems: 2 - maxItems: 4 + maxItems: 5 reg-names: minItems: 2 - maxItems: 4 + maxItems: 5 items: - enum: [dbi, dbi2, config, atu, addr_space, link, atu_dma, appl] + oneOf: + - description: + Basic DWC PCIe controller configuration-space accessible over + the DBI interface. This memory space is either activated with + CDM/ELBI = 0 and CS2 = 0 or is a contiguous memory region + with all spaces. Note iATU/eDMA CSRs are indirectly accessible + via the PL viewports on the DWC PCIe controllers older than + v4.80a. + const: dbi + - description: + Shadow DWC PCIe config-space registers. This space is selected + by setting CDM/ELBI = 0 and CS2 = 1. This is an intermix of + the PCI-SIG PCIe CFG-space with the shadow registers for some + PCI Header space, PCI Standard and Extended Structures. It's + mainly relevant for the end-point controller configuration, + but still there are some shadow registers available for the + Root Port mode too. + const: dbi2 + - description: + External Local Bus registers. It's an application-dependent + registers normally defined by the platform engineers. The space + can be selected by setting CDM/ELBI = 1 and CS2 = 0 wires or can + be accessed over some platform-specific means (for instance + as a part of a system controller). + enum: [ elbi, app ] + - description: + iATU/eDMA registers common for all device functions. It's an + unrolled memory space with the internal Address Translation + Unit and Enhanced DMA, which is selected by setting CDM/ELBI = 1 + and CS2 = 1. For IP-core releases prior v4.80a, these registers + have been programmed via an indirect addressing scheme using a + set of viewport CSRs mapped into the PL space. Note iATU is + normally mapped to the 0x0 address of this region, while eDMA + is available at 0x80000 base address. + const: atu + - description: + Platform-specific eDMA registers. Some platforms may have eDMA + CSRs mapped in a non-standard base address. The registers offset + can be changed or the MS/LS-bits of the address can be attached + in an additional RTL block before the MEM-IO transactions reach + the DW PCIe slave interface. + const: dma + - description: + PHY/PCS configuration registers. Some platforms can have the + PCS and PHY CSRs accessible over a dedicated memory mapped + region, but mainly these registers are indirectly accessible + either by means of the embedded PHY viewport schema or by some + platform-specific method. + const: phy + - description: + Outbound iATU-capable memory-region which will be used to + generate various application-specific traffic on the PCIe bus + hierarchy. It's usage scenario depends on the endpoint + functionality, for instance it can be used to create MSI(X) + messages. + const: addr_space + - description: + Vendor-specific CSR names. Consider using the generic names above + for new bindings. + oneOf: + - description: See native 'elbi/app' CSR region for details. + enum: [ link, appl ] + - description: See native 'atu' CSR region for details. + enum: [ atu_dma ] + allOf: + - contains: + const: dbi + - contains: + const: addr_space interrupts: description: diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index fa1db57b2b979..59d3bbb5883a7 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -28,10 +28,10 @@ allOf: properties: reg: - description: | - It should contain Data Bus Interface (dbi) and config registers for all - versions. - For designware core version >= 4.80, it may contain ATU address space. + description: + At least DBI reg-space and peripheral devices CFG-space outbound window + are required for the normal controller work. iATU memory IO region is + also required if the space is unrolled (IP-core version >= 4.80a). minItems: 2 maxItems: 5 @@ -39,8 +39,74 @@ properties: minItems: 2 maxItems: 5 items: - enum: [ dbi, dbi2, config, atu, atu_dma, app, appl, elbi, mgmt, ctrl, - parf, cfg, link, ulreg, smu, mpu, apb, phy ] + oneOf: + - description: + Basic DWC PCIe controller configuration-space accessible over + the DBI interface. This memory space is either activated with + CDM/ELBI = 0 and CS2 = 0 or is a contiguous memory region + with all spaces. Note iATU/eDMA CSRs are indirectly accessible + via the PL viewports on the DWC PCIe controllers older than + v4.80a. + const: dbi + - description: + Shadow DWC PCIe config-space registers. This space is selected + by setting CDM/ELBI = 0 and CS2 = 1. This is an intermix of + the PCI-SIG PCIe CFG-space with the shadow registers for some + PCI Header space, PCI Standard and Extended Structures. It's + mainly relevant for the end-point controller configuration, + but still there are some shadow registers available for the + Root Port mode too. + const: dbi2 + - description: + External Local Bus registers. It's an application-dependent + registers normally defined by the platform engineers. The space + can be selected by setting CDM/ELBI = 1 and CS2 = 0 wires or can + be accessed over some platform-specific means (for instance + as a part of a system controller). + enum: [ elbi, app ] + - description: + iATU/eDMA registers common for all device functions. It's an + unrolled memory space with the internal Address Translation + Unit and Enhanced DMA, which is selected by setting CDM/ELBI = 1 + and CS2 = 1. For IP-core releases prior v4.80a, these registers + have been programmed via an indirect addressing scheme using a + set of viewport CSRs mapped into the PL space. Note iATU is + normally mapped to the 0x0 address of this region, while eDMA + is available at 0x80000 base address. + const: atu + - description: + Platform-specific eDMA registers. Some platforms may have eDMA + CSRs mapped in a non-standard base address. The registers offset + can be changed or the MS/LS-bits of the address can be attached + in an additional RTL block before the MEM-IO transactions reach + the DW PCIe slave interface. + const: dma + - description: + PHY/PCS configuration registers. Some platforms can have the + PCS and PHY CSRs accessible over a dedicated memory mapped + region, but mainly these registers are indirectly accessible + either by means of the embedded PHY viewport schema or by some + platform-specific method. + const: phy + - description: + Outbound iATU-capable memory-region which will be used to access + the peripheral PCIe devices configuration space. + const: config + - description: + Vendor-specific CSR names. Consider using the generic names above + for new bindings. + oneOf: + - description: See native 'elbi/app' CSR region for details. + enum: [ apb, mgmt, link, ulreg, appl ] + - description: See native 'atu' CSR region for details. + enum: [ atu_dma ] + - description: Syscon-related CSR regions. + enum: [ smu, mpu ] + allOf: + - contains: + const: dbi + - contains: + const: config interrupts: description: -- GitLab From bd9504af9169131156e753a6e47de34ad7a97b7d Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:52 +0300 Subject: [PATCH 0863/1423] dt-bindings: PCI: dwc: Add clocks/resets common properties DW PCIe RP/EP reference manuals explicit define all the clocks and reset requirements in [1] and [2]. Seeing the DW PCIe vendor-specific DT-bindings have already started assigning random names to the same set of the clocks and resets lines, let's define a generic names sets and add them to the DW PCIe common DT-schema. Note since there are DW PCI-based vendor-specific DT-bindings with the custom names assigned to the same clocks and resets resources we have no much choice but to add them to the generic DT-schemas in order to have the schemas being applicable for such devices. These names are marked as vendor-specific and should be avoided being used in new bindings in favor of the generic names. [1] Synopsys DesignWare Cores PCI Express Controller Databook - DWC PCIe Root Port, Version 5.40a, March 2019, p.55 - 78. [2] Synopsys DesignWare Cores PCI Express Controller Databook - DWC PCIe Endpoint, Version 5.40a, March 2019, p.58 - 81. Link: https://lore.kernel.org/r/20221113191301.5526-12-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/snps,dw-pcie-common.yaml | 120 ++++++++++++++++++ .../bindings/pci/snps,dw-pcie-ep.yaml | 6 + .../devicetree/bindings/pci/snps,dw-pcie.yaml | 2 - 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index 13c41cd50e54a..4d9efcea38598 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -58,6 +58,126 @@ properties: minItems: 1 maxItems: 26 + clocks: + description: + DWC PCIe reference manual explicitly defines a set of the clocks required + to get the controller working correctly. In general all of them can + be divided into two groups':' application and core clocks. Note the + platforms may have some of the clock sources unspecified in case if the + corresponding domains are fed up from a common clock source. + minItems: 1 + maxItems: 7 + + clock-names: + minItems: 1 + maxItems: 7 + items: + oneOf: + - description: + Data Bus Interface (DBI) clock. Clock signal for the AXI-bus + interface of the Configuration-Dependent Module, which is + basically the set of the controller CSRs. + const: dbi + - description: + Application AXI-bus Master interface clock. Basically this is + a clock for the controller DMA interface (PCI-to-CPU). + const: mstr + - description: + Application AXI-bus Slave interface clock. This is a clock for + the CPU-to-PCI memory IO interface. + const: slv + - description: + Controller Core-PCS PIPE interface clock. It's normally + supplied by an external PCS-PHY. + const: pipe + - description: + Controller Primary clock. It's assumed that all controller input + signals (except resets) are synchronous to this clock. + const: core + - description: + Auxiliary clock for the controller PMC domain. The controller + partitioning implies having some parts to operate with this + clock in some power management states. + const: aux + - description: + Generic reference clock. In case if there are several + interfaces fed up with a common clock source it's advisable to + define it with this name (for instance pipe, core and aux can + be connected to a single source of the periodic signal). + const: ref + - description: + Clock for the PHY registers interface. Originally this is + a PHY-viewport-based interface, but some platform may have + specifically designed one. + const: phy_reg + - description: + Vendor-specific clock names. Consider using the generic names + above for new bindings. + oneOf: + - description: See native 'dbi' clock for details + enum: [ pcie, pcie_apb_sys, aclk_dbi ] + - description: See native 'mstr/slv' clock for details + enum: [ pcie_bus, pcie_inbound_axi, pcie_aclk, aclk_mst, aclk_slv ] + - description: See native 'pipe' clock for details + enum: [ pcie_phy, pcie_phy_ref, link ] + - description: See native 'aux' clock for details + enum: [ pcie_aux ] + - description: See native 'ref' clock for details. + enum: [ gio ] + - description: See nativs 'phy_reg' clock for details + enum: [ pcie_apb_phy, pclk ] + + resets: + description: + DWC PCIe reference manual explicitly defines a set of the reset + signals required to be de-asserted to properly activate the controller + sub-parts. All of these signals can be divided into two sub-groups':' + application and core resets with respect to the main sub-domains they + are supposed to reset. Note the platforms may have some of these signals + unspecified in case if they are automatically handled or aggregated into + a comprehensive control module. + minItems: 1 + maxItems: 10 + + reset-names: + minItems: 1 + maxItems: 10 + items: + oneOf: + - description: Data Bus Interface (DBI) domain reset + const: dbi + - description: AXI-bus Master interface reset + const: mstr + - description: AXI-bus Slave interface reset + const: slv + - description: Application-dependent interface reset + const: app + - description: Controller Non-sticky CSR flags reset + const: non-sticky + - description: Controller sticky CSR flags reset + const: sticky + - description: PIPE-interface (Core-PCS) logic reset + const: pipe + - description: + Controller primary reset (resets everything except PMC module) + const: core + - description: PCS/PHY block reset + const: phy + - description: PMC hot reset signal + const: hot + - description: Cold reset signal + const: pwr + - description: + Vendor-specific reset names. Consider using the generic names + above for new bindings. + oneOf: + - description: See native 'app' reset for details + enum: [ apps, gio, apb ] + - description: See native 'phy' reset for details + enum: [ pciephy, link ] + - description: See native 'pwr' reset for details + enum: [ turnoff ] + phys: description: There can be up to the number of possible lanes PHYs specified placed in diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml index f4d7eb2dec4db..8fc2151691a47 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml @@ -180,6 +180,12 @@ examples: interrupts = <23>, <24>; interrupt-names = "dma0", "dma1"; + clocks = <&sys_clk 12>, <&sys_clk 24>; + clock-names = "dbi", "ref"; + + resets = <&sys_rst 12>, <&sys_rst 24>; + reset-names = "dbi", "phy"; + phys = <&pcie_phy0>, <&pcie_phy1>, <&pcie_phy2>, <&pcie_phy3>; phy-names = "pcie0", "pcie1", "pcie2", "pcie3"; diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml index 59d3bbb5883a7..c62c8fe517aef 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml @@ -195,8 +195,6 @@ properties: - contains: const: msi - clocks: true - additionalProperties: true required: -- GitLab From 4a8972542a6d1eee81c7cc27699b0a47f6a6619e Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:53 +0300 Subject: [PATCH 0864/1423] dt-bindings: PCI: dwc: Add dma-coherent property DW PCIe EP/RP AXI- and TRGT1-master interfaces are responsible for the application memory access. They are used by the RP/EP PCIe buses (MWr/MWr TLPs emitted by the peripheral PCIe devices) and the eDMA block. Since all of them mainly involve the system memory and basically mean DMA we can expect the corresponding platforms can be designed in a way to make sure the transactions are cache-coherent. As such the DW PCIe DT-nodes can have the 'dma-coherent' property specified. Let's permit it in the DT-bindings then. Link: https://lore.kernel.org/r/20221113191301.5526-13-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml index 4d9efcea38598..d87e13496834a 100644 --- a/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml +++ b/Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml @@ -259,6 +259,8 @@ properties: configuration space registers, Port Logic registers, DMA and iATU registers. This feature has been available since DWC PCIe v4.80a. + dma-coherent: true + additionalProperties: true ... -- GitLab From 98b59129cb9f43a37bb92a577145f29ca54353a7 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:54 +0300 Subject: [PATCH 0865/1423] dt-bindings: PCI: dwc: Apply common schema to Rockchip DW PCIe nodes As the DT-bindings description states the Rockchip PCIe controller is based on the DW PCIe RP IP-core thus its DT-nodes are supposed to be compatible with the common DW PCIe controller schema. Let's make sure they are evaluated against it by referring to the snps,dw-pcie.yaml schema in the allOf sub-schemas composition. Link: https://lore.kernel.org/r/20221113191301.5526-14-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml index bc0a9d1db750b..2be72ae1169f9 100644 --- a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml +++ b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml @@ -14,10 +14,10 @@ maintainers: description: |+ RK3568 SoC PCIe host controller is based on the Synopsys DesignWare PCIe IP and thus inherits all the common properties defined in - designware-pcie.txt. + snps,dw-pcie.yaml. allOf: - - $ref: /schemas/pci/pci-bus.yaml# + - $ref: /schemas/pci/snps,dw-pcie.yaml# properties: compatible: -- GitLab From ce27c4e61f2dcc41d13f54cbecbd3a4b15db86c8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:55 +0300 Subject: [PATCH 0866/1423] dt-bindings: PCI: dwc: Add Baikal-T1 PCIe Root Port bindings Baikal-T1 SoC is equipped with DWC PCIe v4.60a Root Port controller, which link can be trained to work on up to Gen.3 speed over up to x4 lanes. The controller is supposed to be fed up with four clock sources: DBI peripheral clock, AXI application Tx/Rx clocks and external PHY/core reference clock generating the 100MHz signal. In addition to that the platform provide a way to reset each part of the controller: sticky/non-sticky bits, host controller core, PIPE interface, PCS/PHY and Hot/Power reset signal. The Root Port controller is equipped with multiple IRQ lines like MSI, system AER, PME, HP, Bandwidth change, Link equalization request and eDMA ones. The registers space is accessed over the DBI interface. There can be no more than four inbound or outbound iATU windows configured. Link: https://lore.kernel.org/r/20221113191301.5526-15-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/baikal,bt1-pcie.yaml | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/baikal,bt1-pcie.yaml diff --git a/Documentation/devicetree/bindings/pci/baikal,bt1-pcie.yaml b/Documentation/devicetree/bindings/pci/baikal,bt1-pcie.yaml new file mode 100644 index 0000000000000..8eaa07ae97743 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/baikal,bt1-pcie.yaml @@ -0,0 +1,168 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/baikal,bt1-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Baikal-T1 PCIe Root Port Controller + +maintainers: + - Serge Semin + +description: + Embedded into Baikal-T1 SoC Root Complex controller with a single port + activated. It's based on the DWC RC PCIe v4.60a IP-core, which is configured + to have just a single Root Port function and is capable of establishing the + link up to Gen.3 speed on x4 lanes. It doesn't have embedded clock and reset + control module, so the proper interface initialization is supposed to be + performed by software. There four in- and four outbound iATU regions + which can be used to emit all required TLP types on the PCIe bus. + +allOf: + - $ref: /schemas/pci/snps,dw-pcie.yaml# + +properties: + compatible: + const: baikal,bt1-pcie + + reg: + description: + DBI, DBI2 and at least 4KB outbound iATU-capable region for the + peripheral devices CFG-space access. + maxItems: 3 + + reg-names: + items: + - const: dbi + - const: dbi2 + - const: config + + interrupts: + description: + MSI, AER, PME, Hot-plug, Link Bandwidth Management, Link Equalization + request and eight Read/Write eDMA IRQ lines are available. + maxItems: 14 + + interrupt-names: + items: + - const: dma0 + - const: dma1 + - const: dma2 + - const: dma3 + - const: dma4 + - const: dma5 + - const: dma6 + - const: dma7 + - const: msi + - const: aer + - const: pme + - const: hp + - const: bw_mg + - const: l_eq + + clocks: + description: + DBI (attached to the APB bus), AXI-bus master and slave interfaces + are fed up by the dedicated application clocks. A common reference + clock signal is supposed to be attached to the corresponding Ref-pad + of the SoC. It will be redistributed amongst the controller core + sub-modules (pipe, core, aux, etc). + maxItems: 4 + + clock-names: + items: + - const: dbi + - const: mstr + - const: slv + - const: ref + + resets: + description: + A comprehensive controller reset logic is supposed to be implemented + by software, so almost all the possible application and core reset + signals are exposed via the system CCU module. + maxItems: 9 + + reset-names: + items: + - const: mstr + - const: slv + - const: pwr + - const: hot + - const: phy + - const: core + - const: pipe + - const: sticky + - const: non-sticky + + baikal,bt1-syscon: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Phandle to the Baikal-T1 System Controller DT node. It's required to + access some additional PM, Reset-related and LTSSM signals. + + num-lanes: + maximum: 4 + + max-link-speed: + maximum: 3 + +required: + - compatible + - reg + - reg-names + - interrupts + - interrupt-names + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pcie@1f052000 { + compatible = "baikal,bt1-pcie"; + device_type = "pci"; + reg = <0x1f052000 0x1000>, <0x1f053000 0x1000>, <0x1bdbf000 0x1000>; + reg-names = "dbi", "dbi2", "config"; + #address-cells = <3>; + #size-cells = <2>; + ranges = <0x81000000 0 0x00000000 0x1bdb0000 0 0x00008000>, + <0x82000000 0 0x20000000 0x08000000 0 0x13db0000>; + bus-range = <0x0 0xff>; + + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "dma0", "dma1", "dma2", "dma3", + "dma4", "dma5", "dma6", "dma7", + "msi", "aer", "pme", "hp", "bw_mg", + "l_eq"; + + clocks = <&ccu_sys 1>, <&ccu_axi 6>, <&ccu_axi 7>, <&clk_pcie>; + clock-names = "dbi", "mstr", "slv", "ref"; + + resets = <&ccu_axi 6>, <&ccu_axi 7>, <&ccu_sys 7>, <&ccu_sys 10>, + <&ccu_sys 4>, <&ccu_sys 6>, <&ccu_sys 5>, <&ccu_sys 8>, + <&ccu_sys 9>; + reset-names = "mstr", "slv", "pwr", "hot", "phy", "core", "pipe", + "sticky", "non-sticky"; + + reset-gpios = <&port0 0 GPIO_ACTIVE_LOW>; + + num-lanes = <4>; + max-link-speed = <3>; + }; +... -- GitLab From 8522e17d4cab47b35d43943ca13d677e76ab01b7 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:56 +0300 Subject: [PATCH 0867/1423] PCI: dwc: Introduce dma-ranges property support for RC-host In accordance with the generic PCIe Root Port DT-bindings the "dma-ranges" property has the same format as the "ranges" property. The only difference is in their semantics. The "dma-ranges" property describes the PCIe-to-CPU memory mapping in opposite to the CPU-to-PCIe mapping of the "ranges" property. Even though the DW PCIe controllers are normally equipped with the internal Address Translation Unit which inbound and outbound tables can be used to implement both properties semantics, it was surprising for me to discover that the host-related part of the DW PCIe driver currently supports the "ranges" property only while the "dma-ranges" windows are just ignored. Having the "dma-ranges" supported in the driver would be very handy for the platforms, that don't tolerate the 1:1 CPU-PCIe memory mapping and require a customized PCIe memory layout. So let's fix that by introducing the "dma-ranges" property support. First of all we suggest to rename the dw_pcie_prog_inbound_atu() method to dw_pcie_prog_ep_inbound_atu() and create a new version of the dw_pcie_prog_inbound_atu() function. Thus we'll have two methods for the RC and EP controllers respectively in the same way as it has been developed for the outbound ATU setup methods. Secondly aside with the memory window index and type the new dw_pcie_prog_inbound_atu() function will accept CPU address, PCIe address and size as its arguments. These parameters define the PCIe and CPU memory ranges which will be used to setup the respective inbound ATU mapping. The passed parameters need to be verified against the ATU ranges constraints in the same way as it is done for the outbound ranges. Finally the DMA-ranges detected for the PCIe controller need to be converted to the inbound ATU entries during the host controller initialization procedure. It will be done in the framework of the dw_pcie_iatu_setup() method. Note before setting the inbound ranges up we need to disable all the inbound ATU entries in order to prevent unexpected PCIe TLPs translations defined by some third party software like bootloaders. Link: https://lore.kernel.org/r/20221113191301.5526-16-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam --- .../pci/controller/dwc/pcie-designware-ep.c | 4 +- .../pci/controller/dwc/pcie-designware-host.c | 32 ++++++++++- drivers/pci/controller/dwc/pcie-designware.c | 56 ++++++++++++++++++- drivers/pci/controller/dwc/pcie-designware.h | 6 +- 4 files changed, 89 insertions(+), 9 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index efc6c6360e28f..40d0056b2f568 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -171,8 +171,8 @@ static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, u8 func_no, int type, return -EINVAL; } - ret = dw_pcie_prog_inbound_atu(pci, func_no, free_win, type, - cpu_addr, bar); + ret = dw_pcie_prog_ep_inbound_atu(pci, func_no, free_win, type, + cpu_addr, bar); if (ret < 0) { dev_err(pci->dev, "Failed to program IB window\n"); return ret; diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 39f3b37d4033c..ea923c25e12d2 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -643,12 +643,15 @@ static int dw_pcie_iatu_setup(struct dw_pcie_rp *pp) } /* - * Ensure all outbound windows are disabled before proceeding with - * the MEM/IO ranges setups. + * Ensure all out/inbound windows are disabled before proceeding with + * the MEM/IO (dma-)ranges setups. */ for (i = 0; i < pci->num_ob_windows; i++) dw_pcie_disable_atu(pci, PCIE_ATU_REGION_DIR_OB, i); + for (i = 0; i < pci->num_ib_windows; i++) + dw_pcie_disable_atu(pci, PCIE_ATU_REGION_DIR_IB, i); + i = 0; resource_list_for_each_entry(entry, &pp->bridge->windows) { if (resource_type(entry->res) != IORESOURCE_MEM) @@ -685,9 +688,32 @@ static int dw_pcie_iatu_setup(struct dw_pcie_rp *pp) } if (pci->num_ob_windows <= i) - dev_warn(pci->dev, "Resources exceed number of ATU entries (%d)\n", + dev_warn(pci->dev, "Ranges exceed outbound iATU size (%d)\n", pci->num_ob_windows); + i = 0; + resource_list_for_each_entry(entry, &pp->bridge->dma_ranges) { + if (resource_type(entry->res) != IORESOURCE_MEM) + continue; + + if (pci->num_ib_windows <= i) + break; + + ret = dw_pcie_prog_inbound_atu(pci, i++, PCIE_ATU_TYPE_MEM, + entry->res->start, + entry->res->start - entry->offset, + resource_size(entry->res)); + if (ret) { + dev_err(pci->dev, "Failed to set DMA range %pr\n", + entry->res); + return ret; + } + } + + if (pci->num_ib_windows <= i) + dev_warn(pci->dev, "Dma-ranges exceed inbound iATU size (%u)\n", + pci->num_ib_windows); + return 0; } diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 432aead68d1fd..7f1fb764897df 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -393,8 +393,60 @@ static inline void dw_pcie_writel_atu_ib(struct dw_pcie *pci, u32 index, u32 reg dw_pcie_writel_atu(pci, PCIE_ATU_REGION_DIR_IB, index, reg, val); } -int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, - int type, u64 cpu_addr, u8 bar) +int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, int index, int type, + u64 cpu_addr, u64 pci_addr, u64 size) +{ + u64 limit_addr = pci_addr + size - 1; + u32 retries, val; + + if ((limit_addr & ~pci->region_limit) != (pci_addr & ~pci->region_limit) || + !IS_ALIGNED(cpu_addr, pci->region_align) || + !IS_ALIGNED(pci_addr, pci->region_align) || !size) { + return -EINVAL; + } + + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_LOWER_BASE, + lower_32_bits(pci_addr)); + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_UPPER_BASE, + upper_32_bits(pci_addr)); + + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_LIMIT, + lower_32_bits(limit_addr)); + if (dw_pcie_ver_is_ge(pci, 460A)) + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_UPPER_LIMIT, + upper_32_bits(limit_addr)); + + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_LOWER_TARGET, + lower_32_bits(cpu_addr)); + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_UPPER_TARGET, + upper_32_bits(cpu_addr)); + + val = type; + if (upper_32_bits(limit_addr) > upper_32_bits(pci_addr) && + dw_pcie_ver_is_ge(pci, 460A)) + val |= PCIE_ATU_INCREASE_REGION_SIZE; + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_REGION_CTRL1, val); + dw_pcie_writel_atu_ib(pci, index, PCIE_ATU_REGION_CTRL2, PCIE_ATU_ENABLE); + + /* + * Make sure ATU enable takes effect before any subsequent config + * and I/O accesses. + */ + for (retries = 0; retries < LINK_WAIT_MAX_IATU_RETRIES; retries++) { + val = dw_pcie_readl_atu_ib(pci, index, PCIE_ATU_REGION_CTRL2); + if (val & PCIE_ATU_ENABLE) + return 0; + + mdelay(LINK_WAIT_IATU); + } + + dev_err(pci->dev, "Inbound iATU is not being enabled\n"); + + return -ETIMEDOUT; +} + +int dw_pcie_prog_ep_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, + int type, u64 cpu_addr, u8 bar) { u32 retries, val; diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index a871ae7eb59ec..37801bbce8540 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -346,8 +346,10 @@ int dw_pcie_prog_outbound_atu(struct dw_pcie *pci, int index, int type, u64 cpu_addr, u64 pci_addr, u64 size); int dw_pcie_prog_ep_outbound_atu(struct dw_pcie *pci, u8 func_no, int index, int type, u64 cpu_addr, u64 pci_addr, u64 size); -int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, - int type, u64 cpu_addr, u8 bar); +int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, int index, int type, + u64 cpu_addr, u64 pci_addr, u64 size); +int dw_pcie_prog_ep_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, + int type, u64 cpu_addr, u8 bar); void dw_pcie_disable_atu(struct dw_pcie *pci, u32 dir, int index); void dw_pcie_setup(struct dw_pcie *pci); void dw_pcie_iatu_detect(struct dw_pcie *pci); -- GitLab From 7f9e982dc4fcf7b4bc7e9dc8a9f344395fc125b8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:57 +0300 Subject: [PATCH 0868/1423] PCI: dwc: Introduce generic controller capabilities interface Since in addition to the already available iATU unrolled mapping we are about to add a few more DW PCIe platform-specific capabilities (CDM-check and generic clocks/resets resources) let's add a generic interface to set and get the flags indicating their availability. The new interface shall improve maintainability of the platform-specific code. Link: https://lore.kernel.org/r/20221113191301.5526-17-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam --- drivers/pci/controller/dwc/pcie-designware.c | 11 ++++++----- drivers/pci/controller/dwc/pcie-designware.h | 12 +++++++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7f1fb764897df..b9cc4b00e5fe7 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -211,7 +211,7 @@ void dw_pcie_write_dbi2(struct dw_pcie *pci, u32 reg, size_t size, u32 val) static inline void __iomem *dw_pcie_select_atu(struct dw_pcie *pci, u32 dir, u32 index) { - if (pci->iatu_unroll_enabled) + if (dw_pcie_cap_is(pci, IATU_UNROLL)) return pci->atu_base + PCIE_ATU_UNROLL_BASE(dir, index); dw_pcie_writel_dbi(pci, PCIE_ATU_VIEWPORT, dir | index); @@ -591,7 +591,7 @@ static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci) u32 val, min, dir; u64 max; - if (pci->iatu_unroll_enabled) { + if (dw_pcie_cap_is(pci, IATU_UNROLL)) { max_region = min((int)pci->atu_size / 512, 256); } else { dw_pcie_writel_dbi(pci, PCIE_ATU_VIEWPORT, 0xFF); @@ -641,8 +641,9 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci) { struct platform_device *pdev = to_platform_device(pci->dev); - pci->iatu_unroll_enabled = dw_pcie_iatu_unroll_enabled(pci); - if (pci->iatu_unroll_enabled) { + if (dw_pcie_iatu_unroll_enabled(pci)) { + dw_pcie_cap_set(pci, IATU_UNROLL); + if (!pci->atu_base) { struct resource *res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "atu"); @@ -664,7 +665,7 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci) dw_pcie_iatu_detect_regions(pci); - dev_info(pci->dev, "iATU unroll: %s\n", pci->iatu_unroll_enabled ? + dev_info(pci->dev, "iATU unroll: %s\n", dw_pcie_cap_is(pci, IATU_UNROLL) ? "enabled" : "disabled"); dev_info(pci->dev, "iATU regions: %u ob, %u ib, align %uK, limit %lluG\n", diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 37801bbce8540..c6dddacee3b1c 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -12,6 +12,7 @@ #define _PCIE_DESIGNWARE_H #include +#include #include #include #include @@ -43,6 +44,15 @@ (__dw_pcie_ver_cmp(_pci, _ver, ==) && \ __dw_pcie_ver_cmp(_pci, TYPE_ ## _type, >=)) +/* DWC PCIe controller capabilities */ +#define DW_PCIE_CAP_IATU_UNROLL 1 + +#define dw_pcie_cap_is(_pci, _cap) \ + test_bit(DW_PCIE_CAP_ ## _cap, &(_pci)->caps) + +#define dw_pcie_cap_set(_pci, _cap) \ + set_bit(DW_PCIE_CAP_ ## _cap, &(_pci)->caps) + /* Parameters for the waiting for link up routine */ #define LINK_WAIT_MAX_RETRIES 10 #define LINK_WAIT_USLEEP_MIN 90000 @@ -317,10 +327,10 @@ struct dw_pcie { const struct dw_pcie_ops *ops; u32 version; u32 type; + unsigned long caps; int num_lanes; int link_gen; u8 n_fts[2]; - bool iatu_unroll_enabled: 1; }; #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp) -- GitLab From ef8c58877fe77c7807777f61f59cffaee89881f7 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:58 +0300 Subject: [PATCH 0869/1423] PCI: dwc: Introduce generic resources getter Currently the DW PCIe Root Port and Endpoint CSR spaces are retrieved in the separate parts of the DW PCIe core driver. It doesn't really make sense since the both controller types have identical set of the core CSR regions: DBI, DBI CS2 and iATU/eDMA. Thus we can simplify the DW PCIe Host and EP initialization methods by moving the platform-specific registers space getting and mapping into a common method. It gets to be even more justified seeing the CSRs base address pointers are preserved in the common DW PCIe descriptor. Note all the OF-based common DW PCIe settings initialization will be moved to the new method too in order to have a single function for all the generic platform properties handling in single place. A nice side-effect of this change is that the pcie-designware-host.c and pcie-designware-ep.c drivers are cleaned up from all the direct dw_pcie storage modification, which makes the DW PCIe core, Root Port and Endpoint modules more coherent. Link: https://lore.kernel.org/r/20221113191301.5526-18-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../pci/controller/dwc/pcie-designware-ep.c | 25 +------ .../pci/controller/dwc/pcie-designware-host.c | 15 +--- drivers/pci/controller/dwc/pcie-designware.c | 75 ++++++++++++++----- drivers/pci/controller/dwc/pcie-designware.h | 3 + 4 files changed, 65 insertions(+), 53 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 40d0056b2f568..d06654895ebae 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -13,8 +13,6 @@ #include #include -#include "../../pci.h" - void dw_pcie_ep_linkup(struct dw_pcie_ep *ep) { struct pci_epc *epc = ep->epc; @@ -711,23 +709,9 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) INIT_LIST_HEAD(&ep->func_list); - if (!pci->dbi_base) { - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); - pci->dbi_base = devm_pci_remap_cfg_resource(dev, res); - if (IS_ERR(pci->dbi_base)) - return PTR_ERR(pci->dbi_base); - } - - if (!pci->dbi_base2) { - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi2"); - if (!res) { - pci->dbi_base2 = pci->dbi_base + SZ_4K; - } else { - pci->dbi_base2 = devm_pci_remap_cfg_resource(dev, res); - if (IS_ERR(pci->dbi_base2)) - return PTR_ERR(pci->dbi_base2); - } - } + ret = dw_pcie_get_resources(pci); + if (ret) + return ret; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "addr_space"); if (!res) @@ -756,9 +740,6 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) return -ENOMEM; ep->outbound_addr = addr; - if (pci->link_gen < 1) - pci->link_gen = of_pci_get_max_link_speed(np); - epc = devm_pci_epc_create(dev, &epc_ops); if (IS_ERR(epc)) { dev_err(dev, "Failed to create epc device\n"); diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index ea923c25e12d2..3ab6ae3712c42 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -16,7 +16,6 @@ #include #include -#include "../../pci.h" #include "pcie-designware.h" static struct pci_ops dw_pcie_ops; @@ -395,6 +394,10 @@ int dw_pcie_host_init(struct dw_pcie_rp *pp) raw_spin_lock_init(&pp->lock); + ret = dw_pcie_get_resources(pci); + if (ret) + return ret; + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "config"); if (res) { pp->cfg0_size = resource_size(res); @@ -408,13 +411,6 @@ int dw_pcie_host_init(struct dw_pcie_rp *pp) return -ENODEV; } - if (!pci->dbi_base) { - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); - pci->dbi_base = devm_pci_remap_cfg_resource(dev, res); - if (IS_ERR(pci->dbi_base)) - return PTR_ERR(pci->dbi_base); - } - bridge = devm_pci_alloc_host_bridge(dev, 0); if (!bridge) return -ENOMEM; @@ -429,9 +425,6 @@ int dw_pcie_host_init(struct dw_pcie_rp *pp) pp->io_base = pci_pio_to_address(win->res->start); } - if (pci->link_gen < 1) - pci->link_gen = of_pci_get_max_link_speed(np); - /* Set default bus ops */ bridge->ops = &dw_pcie_ops; bridge->child_ops = &dw_child_pcie_ops; diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index b9cc4b00e5fe7..393e64ecccd35 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,59 @@ #include "../../pci.h" #include "pcie-designware.h" +int dw_pcie_get_resources(struct dw_pcie *pci) +{ + struct platform_device *pdev = to_platform_device(pci->dev); + struct device_node *np = dev_of_node(pci->dev); + struct resource *res; + + if (!pci->dbi_base) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(pci->dev, res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + } + + /* DBI2 is mainly useful for the endpoint controller */ + if (!pci->dbi_base2) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi2"); + if (res) { + pci->dbi_base2 = devm_pci_remap_cfg_resource(pci->dev, res); + if (IS_ERR(pci->dbi_base2)) + return PTR_ERR(pci->dbi_base2); + } else { + pci->dbi_base2 = pci->dbi_base + SZ_4K; + } + } + + /* For non-unrolled iATU/eDMA platforms this range will be ignored */ + if (!pci->atu_base) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "atu"); + if (res) { + pci->atu_size = resource_size(res); + pci->atu_base = devm_ioremap_resource(pci->dev, res); + if (IS_ERR(pci->atu_base)) + return PTR_ERR(pci->atu_base); + } else { + pci->atu_base = pci->dbi_base + DEFAULT_DBI_ATU_OFFSET; + } + } + + /* Set a default value suitable for at most 8 in and 8 out windows */ + if (!pci->atu_size) + pci->atu_size = SZ_4K; + + if (pci->link_gen < 1) + pci->link_gen = of_pci_get_max_link_speed(np); + + of_property_read_u32(np, "num-lanes", &pci->num_lanes); + + if (of_property_read_bool(np, "snps,enable-cdm-check")) + dw_pcie_cap_set(pci, CDM_CHECK); + + return 0; +} + void dw_pcie_version_detect(struct dw_pcie *pci) { u32 ver; @@ -639,25 +693,8 @@ static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci) void dw_pcie_iatu_detect(struct dw_pcie *pci) { - struct platform_device *pdev = to_platform_device(pci->dev); - if (dw_pcie_iatu_unroll_enabled(pci)) { dw_pcie_cap_set(pci, IATU_UNROLL); - - if (!pci->atu_base) { - struct resource *res = - platform_get_resource_byname(pdev, IORESOURCE_MEM, "atu"); - if (res) { - pci->atu_size = resource_size(res); - pci->atu_base = devm_ioremap_resource(pci->dev, res); - } - if (!pci->atu_base || IS_ERR(pci->atu_base)) - pci->atu_base = pci->dbi_base + DEFAULT_DBI_ATU_OFFSET; - } - - if (!pci->atu_size) - /* Pick a minimal default, enough for 8 in and 8 out windows */ - pci->atu_size = SZ_4K; } else { pci->atu_base = pci->dbi_base + PCIE_ATU_VIEWPORT_BASE; pci->atu_size = PCIE_ATU_VIEWPORT_SIZE; @@ -675,7 +712,6 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci) void dw_pcie_setup(struct dw_pcie *pci) { - struct device_node *np = pci->dev->of_node; u32 val; if (pci->link_gen > 0) @@ -703,14 +739,13 @@ void dw_pcie_setup(struct dw_pcie *pci) val |= PORT_LINK_DLL_LINK_EN; dw_pcie_writel_dbi(pci, PCIE_PORT_LINK_CONTROL, val); - if (of_property_read_bool(np, "snps,enable-cdm-check")) { + if (dw_pcie_cap_is(pci, CDM_CHECK)) { val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS | PCIE_PL_CHK_REG_CHK_REG_START; dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val); } - of_property_read_u32(np, "num-lanes", &pci->num_lanes); if (!pci->num_lanes) { dev_dbg(pci->dev, "Using h/w default number of lanes\n"); return; diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index c6dddacee3b1c..081f169e6021e 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -46,6 +46,7 @@ /* DWC PCIe controller capabilities */ #define DW_PCIE_CAP_IATU_UNROLL 1 +#define DW_PCIE_CAP_CDM_CHECK 2 #define dw_pcie_cap_is(_pci, _cap) \ test_bit(DW_PCIE_CAP_ ## _cap, &(_pci)->caps) @@ -338,6 +339,8 @@ struct dw_pcie { #define to_dw_pcie_from_ep(endpoint) \ container_of((endpoint), struct dw_pcie, ep) +int dw_pcie_get_resources(struct dw_pcie *pci); + void dw_pcie_version_detect(struct dw_pcie *pci); u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); -- GitLab From 9f67ecdd9579228d656192a4b6e951c757085db8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:12:59 +0300 Subject: [PATCH 0870/1423] PCI: dwc: Combine iATU detection procedures Since the iATU CSR region is now retrieved in the DW PCIe resources getter there is no much benefits in the iATU detection procedures splitting up. Therefore let's join the iATU unroll/viewport detection procedure with the rest of the iATU parameters detection code. The resultant method will be as coherent as before, while the redundant functions will be eliminated thus producing more readable code. Link: https://lore.kernel.org/r/20221113191301.5526-19-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Manivannan Sadhasivam --- drivers/pci/controller/dwc/pcie-designware.c | 39 +++++--------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 393e64ecccd35..e979fb8f3cee9 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -628,26 +628,21 @@ static void dw_pcie_link_set_max_speed(struct dw_pcie *pci, u32 link_gen) } -static bool dw_pcie_iatu_unroll_enabled(struct dw_pcie *pci) -{ - u32 val; - - val = dw_pcie_readl_dbi(pci, PCIE_ATU_VIEWPORT); - if (val == 0xffffffff) - return true; - - return false; -} - -static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci) +void dw_pcie_iatu_detect(struct dw_pcie *pci) { int max_region, ob, ib; u32 val, min, dir; u64 max; - if (dw_pcie_cap_is(pci, IATU_UNROLL)) { + val = dw_pcie_readl_dbi(pci, PCIE_ATU_VIEWPORT); + if (val == 0xFFFFFFFF) { + dw_pcie_cap_set(pci, IATU_UNROLL); + max_region = min((int)pci->atu_size / 512, 256); } else { + pci->atu_base = pci->dbi_base + PCIE_ATU_VIEWPORT_BASE; + pci->atu_size = PCIE_ATU_VIEWPORT_SIZE; + dw_pcie_writel_dbi(pci, PCIE_ATU_VIEWPORT, 0xFF); max_region = dw_pcie_readl_dbi(pci, PCIE_ATU_VIEWPORT) + 1; } @@ -689,23 +684,9 @@ static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci) pci->num_ib_windows = ib; pci->region_align = 1 << fls(min); pci->region_limit = (max << 32) | (SZ_4G - 1); -} - -void dw_pcie_iatu_detect(struct dw_pcie *pci) -{ - if (dw_pcie_iatu_unroll_enabled(pci)) { - dw_pcie_cap_set(pci, IATU_UNROLL); - } else { - pci->atu_base = pci->dbi_base + PCIE_ATU_VIEWPORT_BASE; - pci->atu_size = PCIE_ATU_VIEWPORT_SIZE; - } - - dw_pcie_iatu_detect_regions(pci); - - dev_info(pci->dev, "iATU unroll: %s\n", dw_pcie_cap_is(pci, IATU_UNROLL) ? - "enabled" : "disabled"); - dev_info(pci->dev, "iATU regions: %u ob, %u ib, align %uK, limit %lluG\n", + dev_info(pci->dev, "iATU: unroll %s, %u ob, %u ib, align %uK, limit %lluG\n", + dw_pcie_cap_is(pci, IATU_UNROLL) ? "T" : "F", pci->num_ob_windows, pci->num_ib_windows, pci->region_align / SZ_1K, (pci->region_limit + 1) / SZ_1G); } -- GitLab From ef69f852a9784017e646e50e3efc715dac7e3fc4 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:13:00 +0300 Subject: [PATCH 0871/1423] PCI: dwc: Introduce generic platform clocks and resets Currently almost each platform driver uses its own resets and clocks naming in order to get the corresponding descriptors. It makes the code harder to maintain and comprehend especially seeing the DWC PCIe core main resets and clocks signals set hasn't changed much for about at least one major IP-core release. So in order to organize things around these signals we suggest to create a generic interface for them in accordance with the naming introduced in the DWC PCIe IP-core reference manual: Application clocks: - "dbi" - data bus interface clock (on some DWC PCIe platforms it's referred as "pclk", "pcie", "sys", "ahb", "cfg", "iface", "gio", "reg", "pcie_apb_sys"); - "mstr" - AXI-bus master interface clock (some DWC PCIe glue drivers refer to this clock as "port", "bus", "pcie_bus", "bus_master/master_bus/axi_m", "pcie_aclk"); - "slv" - AXI-bus slave interface clock (also called as "port", "bus", "pcie_bus", "bus_slave/slave_bus/axi_s", "pcie_aclk", "pcie_inbound_axi"). Core clocks: - "pipe" - core-PCS PIPE interface clock coming from external PHY (it's normally named by the platform drivers as just "pipe"); - "core" - primary clock of the controller (none of the platform drivers declare such a clock but in accordance with the ref. manual the devices may have it separately specified); - "aux" - auxiliary PMC domain clock (it is named by some platforms as "pcie_aux" and just "aux"); - "ref" - Generic reference clock (it is a generic clock source, which can be used as a signal source for multiple interfaces, some platforms call it as "ref", "general", "pcie_phy", "pcie_phy_ref"). Application resets: - "dbi" - Data-bus interface reset (it's CSR interface clock and is normally called as "apb" though technically it's not APB but DWC PCIe-specific interface); - "mstr" - AXI-bus master reset (some platforms call it as "port", "apps", "bus", "axi_m"); - "slv" - ABI-bus slave reset (some platforms call it as "port", "apps", "bus", "axi_s"). Core resets: - "non-sticky" - non-sticky CSR flags reset; - "sticky" - sticky CSR flags reset; - "pipe" - PIPE-interface (Core-PCS) logic reset (some platforms call it just "pipe"); - "core" - controller primary reset (resets everything except PMC module, some platforms refer to this signal as "soft", "pci"); - "phy" - PCS/PHY block reset (strictly speaking it is normally connected to the input of an external block, but the reference manual says it must be available for the PMC working correctly, some existing platforms call it "pciephy", "phy", "link"); - "hot" - PMC hot reset signal (also called as "sleep"); - "pwr" - cold reset signal (can be referred as "pwr", "turnoff"). Bus reset: - "perst" - PCIe standard signal used to reset the PCIe peripheral devices. As you can see each platform uses it's own naming for basically the same set of the signals. In the framework of this commit we suggest to add a set of the clocks and reset signals resources, corresponding names and identifiers for each denoted entity. At current stage the platforms will be able to use the provided infrastructure to automatically request all these resources and manipulate with them in the Host/EP init callbacks. Alas it isn't that easy to create a common cold/hot reset procedure due to too many platform-specifics in the procedure, like the external flags exposure and the delays requirement. Link: https://lore.kernel.org/r/20221113191301.5526-20-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-designware.c | 91 ++++++++++++++++++++ drivers/pci/controller/dwc/pcie-designware.h | 42 +++++++++ 2 files changed, 133 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index e979fb8f3cee9..6d5d619ab2e94 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include #include @@ -20,11 +22,89 @@ #include "../../pci.h" #include "pcie-designware.h" +static const char * const dw_pcie_app_clks[DW_PCIE_NUM_APP_CLKS] = { + [DW_PCIE_DBI_CLK] = "dbi", + [DW_PCIE_MSTR_CLK] = "mstr", + [DW_PCIE_SLV_CLK] = "slv", +}; + +static const char * const dw_pcie_core_clks[DW_PCIE_NUM_CORE_CLKS] = { + [DW_PCIE_PIPE_CLK] = "pipe", + [DW_PCIE_CORE_CLK] = "core", + [DW_PCIE_AUX_CLK] = "aux", + [DW_PCIE_REF_CLK] = "ref", +}; + +static const char * const dw_pcie_app_rsts[DW_PCIE_NUM_APP_RSTS] = { + [DW_PCIE_DBI_RST] = "dbi", + [DW_PCIE_MSTR_RST] = "mstr", + [DW_PCIE_SLV_RST] = "slv", +}; + +static const char * const dw_pcie_core_rsts[DW_PCIE_NUM_CORE_RSTS] = { + [DW_PCIE_NON_STICKY_RST] = "non-sticky", + [DW_PCIE_STICKY_RST] = "sticky", + [DW_PCIE_CORE_RST] = "core", + [DW_PCIE_PIPE_RST] = "pipe", + [DW_PCIE_PHY_RST] = "phy", + [DW_PCIE_HOT_RST] = "hot", + [DW_PCIE_PWR_RST] = "pwr", +}; + +static int dw_pcie_get_clocks(struct dw_pcie *pci) +{ + int i, ret; + + for (i = 0; i < DW_PCIE_NUM_APP_CLKS; i++) + pci->app_clks[i].id = dw_pcie_app_clks[i]; + + for (i = 0; i < DW_PCIE_NUM_CORE_CLKS; i++) + pci->core_clks[i].id = dw_pcie_core_clks[i]; + + ret = devm_clk_bulk_get_optional(pci->dev, DW_PCIE_NUM_APP_CLKS, + pci->app_clks); + if (ret) + return ret; + + return devm_clk_bulk_get_optional(pci->dev, DW_PCIE_NUM_CORE_CLKS, + pci->core_clks); +} + +static int dw_pcie_get_resets(struct dw_pcie *pci) +{ + int i, ret; + + for (i = 0; i < DW_PCIE_NUM_APP_RSTS; i++) + pci->app_rsts[i].id = dw_pcie_app_rsts[i]; + + for (i = 0; i < DW_PCIE_NUM_CORE_RSTS; i++) + pci->core_rsts[i].id = dw_pcie_core_rsts[i]; + + ret = devm_reset_control_bulk_get_optional_shared(pci->dev, + DW_PCIE_NUM_APP_RSTS, + pci->app_rsts); + if (ret) + return ret; + + ret = devm_reset_control_bulk_get_optional_exclusive(pci->dev, + DW_PCIE_NUM_CORE_RSTS, + pci->core_rsts); + if (ret) + return ret; + + pci->pe_rst = devm_gpiod_get_optional(pci->dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(pci->pe_rst)) + return PTR_ERR(pci->pe_rst); + + return 0; +} + int dw_pcie_get_resources(struct dw_pcie *pci) { struct platform_device *pdev = to_platform_device(pci->dev); struct device_node *np = dev_of_node(pci->dev); struct resource *res; + int ret; if (!pci->dbi_base) { res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); @@ -62,6 +142,17 @@ int dw_pcie_get_resources(struct dw_pcie *pci) if (!pci->atu_size) pci->atu_size = SZ_4K; + /* LLDD is supposed to manually switch the clocks and resets state */ + if (dw_pcie_cap_is(pci, REQ_RES)) { + ret = dw_pcie_get_clocks(pci); + if (ret) + return ret; + + ret = dw_pcie_get_resets(pci); + if (ret) + return ret; + } + if (pci->link_gen < 1) pci->link_gen = of_pci_get_max_link_speed(np); diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 081f169e6021e..393dfb931df6c 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -13,10 +13,13 @@ #include #include +#include #include +#include #include #include #include +#include #include #include @@ -45,6 +48,7 @@ __dw_pcie_ver_cmp(_pci, TYPE_ ## _type, >=)) /* DWC PCIe controller capabilities */ +#define DW_PCIE_CAP_REQ_RES 0 #define DW_PCIE_CAP_IATU_UNROLL 1 #define DW_PCIE_CAP_CDM_CHECK 2 @@ -233,6 +237,39 @@ enum dw_pcie_device_mode { DW_PCIE_RC_TYPE, }; +enum dw_pcie_app_clk { + DW_PCIE_DBI_CLK, + DW_PCIE_MSTR_CLK, + DW_PCIE_SLV_CLK, + DW_PCIE_NUM_APP_CLKS +}; + +enum dw_pcie_core_clk { + DW_PCIE_PIPE_CLK, + DW_PCIE_CORE_CLK, + DW_PCIE_AUX_CLK, + DW_PCIE_REF_CLK, + DW_PCIE_NUM_CORE_CLKS +}; + +enum dw_pcie_app_rst { + DW_PCIE_DBI_RST, + DW_PCIE_MSTR_RST, + DW_PCIE_SLV_RST, + DW_PCIE_NUM_APP_RSTS +}; + +enum dw_pcie_core_rst { + DW_PCIE_NON_STICKY_RST, + DW_PCIE_STICKY_RST, + DW_PCIE_CORE_RST, + DW_PCIE_PIPE_RST, + DW_PCIE_PHY_RST, + DW_PCIE_HOT_RST, + DW_PCIE_PWR_RST, + DW_PCIE_NUM_CORE_RSTS +}; + struct dw_pcie_host_ops { int (*host_init)(struct dw_pcie_rp *pp); void (*host_deinit)(struct dw_pcie_rp *pp); @@ -332,6 +369,11 @@ struct dw_pcie { int num_lanes; int link_gen; u8 n_fts[2]; + struct clk_bulk_data app_clks[DW_PCIE_NUM_APP_CLKS]; + struct clk_bulk_data core_clks[DW_PCIE_NUM_CORE_CLKS]; + struct reset_control_bulk_data app_rsts[DW_PCIE_NUM_APP_RSTS]; + struct reset_control_bulk_data core_rsts[DW_PCIE_NUM_CORE_RSTS]; + struct gpio_desc *pe_rst; }; #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp) -- GitLab From ba6ed462dcf41a83b36eb9a74a8c4720040f9762 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Sun, 13 Nov 2022 22:13:01 +0300 Subject: [PATCH 0872/1423] PCI: dwc: Add Baikal-T1 PCIe controller support Baikal-T1 SoC is equipped with DWC PCIe v4.60a host controller. It can be trained to work up to Gen.3 speed over up to x4 lanes. The host controller is attached to the DW PCIe 3.0 PCS via the PIPE-4 interface, which in its turn is connected to the DWC 10G PHY. The whole system is supposed to be fed up with four clock sources: DBI peripheral clock, AXI application clocks and external PHY/core reference clock generating the 100MHz signal. In addition to that the platform provide a way to reset each part of the controller: sticky/non-sticky bits, host controller core, PIPE interface, PCS/PHY and Hot/Power reset signal. The driver also provides a way to handle the GPIO-based PERST# signal. Note due to the Baikal-T1 MMIO peculiarity we have to implement the DBI interface accessors which make sure the IO operations are dword-aligned. Link: https://lore.kernel.org/r/20221113191301.5526-21-Sergey.Semin@baikalelectronics.ru Signed-off-by: Serge Semin Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 9 + drivers/pci/controller/dwc/Makefile | 1 + drivers/pci/controller/dwc/pcie-bt1.c | 643 ++++++++++++++++++++++++++ 3 files changed, 653 insertions(+) create mode 100644 drivers/pci/controller/dwc/pcie-bt1.c diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 62ce3abf0f196..771b8b146623f 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -222,6 +222,15 @@ config PCIE_ARTPEC6_EP Enables support for the PCIe controller in the ARTPEC-6 SoC to work in endpoint mode. This uses the DesignWare core. +config PCIE_BT1 + tristate "Baikal-T1 PCIe controller" + depends on MIPS_BAIKAL_T1 || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + help + Enables support for the PCIe controller in the Baikal-T1 SoC to work + in host mode. It's based on the Synopsys DWC PCIe v4.60a IP-core. + config PCIE_ROCKCHIP_DW_HOST bool "Rockchip DesignWare PCIe controller" select PCIE_DW diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index 8ba7b67f5e50a..bf5c311875a1e 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_PCIE_DW) += pcie-designware.o obj-$(CONFIG_PCIE_DW_HOST) += pcie-designware-host.o obj-$(CONFIG_PCIE_DW_EP) += pcie-designware-ep.o obj-$(CONFIG_PCIE_DW_PLAT) += pcie-designware-plat.o +obj-$(CONFIG_PCIE_BT1) += pcie-bt1.o obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o diff --git a/drivers/pci/controller/dwc/pcie-bt1.c b/drivers/pci/controller/dwc/pcie-bt1.c new file mode 100644 index 0000000000000..3346770e66547 --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-bt1.c @@ -0,0 +1,643 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 BAIKAL ELECTRONICS, JSC + * + * Authors: + * Vadim Vlasov + * Serge Semin + * + * Baikal-T1 PCIe controller driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +/* Baikal-T1 System CCU control registers */ +#define BT1_CCU_PCIE_CLKC 0x140 +#define BT1_CCU_PCIE_REQ_PCS_CLK BIT(16) +#define BT1_CCU_PCIE_REQ_MAC_CLK BIT(17) +#define BT1_CCU_PCIE_REQ_PIPE_CLK BIT(18) + +#define BT1_CCU_PCIE_RSTC 0x144 +#define BT1_CCU_PCIE_REQ_LINK_RST BIT(13) +#define BT1_CCU_PCIE_REQ_SMLH_RST BIT(14) +#define BT1_CCU_PCIE_REQ_PHY_RST BIT(16) +#define BT1_CCU_PCIE_REQ_CORE_RST BIT(24) +#define BT1_CCU_PCIE_REQ_STICKY_RST BIT(26) +#define BT1_CCU_PCIE_REQ_NSTICKY_RST BIT(27) + +#define BT1_CCU_PCIE_PMSC 0x148 +#define BT1_CCU_PCIE_LTSSM_STATE_MASK GENMASK(5, 0) +#define BT1_CCU_PCIE_LTSSM_DET_QUIET 0x00 +#define BT1_CCU_PCIE_LTSSM_DET_ACT 0x01 +#define BT1_CCU_PCIE_LTSSM_POLL_ACT 0x02 +#define BT1_CCU_PCIE_LTSSM_POLL_COMP 0x03 +#define BT1_CCU_PCIE_LTSSM_POLL_CONF 0x04 +#define BT1_CCU_PCIE_LTSSM_PRE_DET_QUIET 0x05 +#define BT1_CCU_PCIE_LTSSM_DET_WAIT 0x06 +#define BT1_CCU_PCIE_LTSSM_CFG_LNKWD_START 0x07 +#define BT1_CCU_PCIE_LTSSM_CFG_LNKWD_ACEPT 0x08 +#define BT1_CCU_PCIE_LTSSM_CFG_LNNUM_WAIT 0x09 +#define BT1_CCU_PCIE_LTSSM_CFG_LNNUM_ACEPT 0x0a +#define BT1_CCU_PCIE_LTSSM_CFG_COMPLETE 0x0b +#define BT1_CCU_PCIE_LTSSM_CFG_IDLE 0x0c +#define BT1_CCU_PCIE_LTSSM_RCVR_LOCK 0x0d +#define BT1_CCU_PCIE_LTSSM_RCVR_SPEED 0x0e +#define BT1_CCU_PCIE_LTSSM_RCVR_RCVRCFG 0x0f +#define BT1_CCU_PCIE_LTSSM_RCVR_IDLE 0x10 +#define BT1_CCU_PCIE_LTSSM_L0 0x11 +#define BT1_CCU_PCIE_LTSSM_L0S 0x12 +#define BT1_CCU_PCIE_LTSSM_L123_SEND_IDLE 0x13 +#define BT1_CCU_PCIE_LTSSM_L1_IDLE 0x14 +#define BT1_CCU_PCIE_LTSSM_L2_IDLE 0x15 +#define BT1_CCU_PCIE_LTSSM_L2_WAKE 0x16 +#define BT1_CCU_PCIE_LTSSM_DIS_ENTRY 0x17 +#define BT1_CCU_PCIE_LTSSM_DIS_IDLE 0x18 +#define BT1_CCU_PCIE_LTSSM_DISABLE 0x19 +#define BT1_CCU_PCIE_LTSSM_LPBK_ENTRY 0x1a +#define BT1_CCU_PCIE_LTSSM_LPBK_ACTIVE 0x1b +#define BT1_CCU_PCIE_LTSSM_LPBK_EXIT 0x1c +#define BT1_CCU_PCIE_LTSSM_LPBK_EXIT_TOUT 0x1d +#define BT1_CCU_PCIE_LTSSM_HOT_RST_ENTRY 0x1e +#define BT1_CCU_PCIE_LTSSM_HOT_RST 0x1f +#define BT1_CCU_PCIE_LTSSM_RCVR_EQ0 0x20 +#define BT1_CCU_PCIE_LTSSM_RCVR_EQ1 0x21 +#define BT1_CCU_PCIE_LTSSM_RCVR_EQ2 0x22 +#define BT1_CCU_PCIE_LTSSM_RCVR_EQ3 0x23 +#define BT1_CCU_PCIE_SMLH_LINKUP BIT(6) +#define BT1_CCU_PCIE_RDLH_LINKUP BIT(7) +#define BT1_CCU_PCIE_PM_LINKSTATE_L0S BIT(8) +#define BT1_CCU_PCIE_PM_LINKSTATE_L1 BIT(9) +#define BT1_CCU_PCIE_PM_LINKSTATE_L2 BIT(10) +#define BT1_CCU_PCIE_L1_PENDING BIT(12) +#define BT1_CCU_PCIE_REQ_EXIT_L1 BIT(14) +#define BT1_CCU_PCIE_LTSSM_RCVR_EQ BIT(15) +#define BT1_CCU_PCIE_PM_DSTAT_MASK GENMASK(18, 16) +#define BT1_CCU_PCIE_PM_PME_EN BIT(20) +#define BT1_CCU_PCIE_PM_PME_STATUS BIT(21) +#define BT1_CCU_PCIE_AUX_PM_EN BIT(22) +#define BT1_CCU_PCIE_AUX_PWR_DET BIT(23) +#define BT1_CCU_PCIE_WAKE_DET BIT(24) +#define BT1_CCU_PCIE_TURNOFF_REQ BIT(30) +#define BT1_CCU_PCIE_TURNOFF_ACK BIT(31) + +#define BT1_CCU_PCIE_GENC 0x14c +#define BT1_CCU_PCIE_LTSSM_EN BIT(1) +#define BT1_CCU_PCIE_DBI2_MODE BIT(2) +#define BT1_CCU_PCIE_MGMT_EN BIT(3) +#define BT1_CCU_PCIE_RXLANE_FLIP_EN BIT(16) +#define BT1_CCU_PCIE_TXLANE_FLIP_EN BIT(17) +#define BT1_CCU_PCIE_SLV_XFER_PEND BIT(24) +#define BT1_CCU_PCIE_RCV_XFER_PEND BIT(25) +#define BT1_CCU_PCIE_DBI_XFER_PEND BIT(26) +#define BT1_CCU_PCIE_DMA_XFER_PEND BIT(27) + +#define BT1_CCU_PCIE_LTSSM_LINKUP(_pmsc) \ +({ \ + int __state = FIELD_GET(BT1_CCU_PCIE_LTSSM_STATE_MASK, _pmsc); \ + __state >= BT1_CCU_PCIE_LTSSM_L0 && __state <= BT1_CCU_PCIE_LTSSM_L2_WAKE; \ +}) + +/* Baikal-T1 PCIe specific control registers */ +#define BT1_PCIE_AXI2MGM_LANENUM 0xd04 +#define BT1_PCIE_AXI2MGM_LANESEL_MASK GENMASK(3, 0) + +#define BT1_PCIE_AXI2MGM_ADDRCTL 0xd08 +#define BT1_PCIE_AXI2MGM_PHYREG_ADDR_MASK GENMASK(20, 0) +#define BT1_PCIE_AXI2MGM_READ_FLAG BIT(29) +#define BT1_PCIE_AXI2MGM_DONE BIT(30) +#define BT1_PCIE_AXI2MGM_BUSY BIT(31) + +#define BT1_PCIE_AXI2MGM_WRITEDATA 0xd0c +#define BT1_PCIE_AXI2MGM_WDATA GENMASK(15, 0) + +#define BT1_PCIE_AXI2MGM_READDATA 0xd10 +#define BT1_PCIE_AXI2MGM_RDATA GENMASK(15, 0) + +/* Generic Baikal-T1 PCIe interface resources */ +#define BT1_PCIE_NUM_APP_CLKS ARRAY_SIZE(bt1_pcie_app_clks) +#define BT1_PCIE_NUM_CORE_CLKS ARRAY_SIZE(bt1_pcie_core_clks) +#define BT1_PCIE_NUM_APP_RSTS ARRAY_SIZE(bt1_pcie_app_rsts) +#define BT1_PCIE_NUM_CORE_RSTS ARRAY_SIZE(bt1_pcie_core_rsts) + +/* PCIe bus setup delays and timeouts */ +#define BT1_PCIE_RST_DELAY_MS 100 +#define BT1_PCIE_RUN_DELAY_US 100 +#define BT1_PCIE_REQ_DELAY_US 1 +#define BT1_PCIE_REQ_TIMEOUT_US 1000 +#define BT1_PCIE_LNK_DELAY_US 1000 +#define BT1_PCIE_LNK_TIMEOUT_US 1000000 + +static const enum dw_pcie_app_clk bt1_pcie_app_clks[] = { + DW_PCIE_DBI_CLK, DW_PCIE_MSTR_CLK, DW_PCIE_SLV_CLK, +}; + +static const enum dw_pcie_core_clk bt1_pcie_core_clks[] = { + DW_PCIE_REF_CLK, +}; + +static const enum dw_pcie_app_rst bt1_pcie_app_rsts[] = { + DW_PCIE_MSTR_RST, DW_PCIE_SLV_RST, +}; + +static const enum dw_pcie_core_rst bt1_pcie_core_rsts[] = { + DW_PCIE_NON_STICKY_RST, DW_PCIE_STICKY_RST, DW_PCIE_CORE_RST, + DW_PCIE_PIPE_RST, DW_PCIE_PHY_RST, DW_PCIE_HOT_RST, DW_PCIE_PWR_RST, +}; + +struct bt1_pcie { + struct dw_pcie dw; + struct platform_device *pdev; + struct regmap *sys_regs; +}; +#define to_bt1_pcie(_dw) container_of(_dw, struct bt1_pcie, dw) + +/* + * Baikal-T1 MMIO space must be read/written by the dword-aligned + * instructions. Note the methods are optimized to have the dword operations + * performed with minimum overhead as the most frequently used ones. + */ +static int bt1_pcie_read_mmio(void __iomem *addr, int size, u32 *val) +{ + unsigned int ofs = (uintptr_t)addr & 0x3; + + if (!IS_ALIGNED((uintptr_t)addr, size)) + return -EINVAL; + + *val = readl(addr - ofs) >> ofs * BITS_PER_BYTE; + if (size == 4) { + return 0; + } else if (size == 2) { + *val &= 0xffff; + return 0; + } else if (size == 1) { + *val &= 0xff; + return 0; + } + + return -EINVAL; +} + +static int bt1_pcie_write_mmio(void __iomem *addr, int size, u32 val) +{ + unsigned int ofs = (uintptr_t)addr & 0x3; + u32 tmp, mask; + + if (!IS_ALIGNED((uintptr_t)addr, size)) + return -EINVAL; + + if (size == 4) { + writel(val, addr); + return 0; + } else if (size == 2 || size == 1) { + mask = GENMASK(size * BITS_PER_BYTE - 1, 0); + tmp = readl(addr - ofs) & ~(mask << ofs * BITS_PER_BYTE); + tmp |= (val & mask) << ofs * BITS_PER_BYTE; + writel(tmp, addr - ofs); + return 0; + } + + return -EINVAL; +} + +static u32 bt1_pcie_read_dbi(struct dw_pcie *pci, void __iomem *base, u32 reg, + size_t size) +{ + int ret; + u32 val; + + ret = bt1_pcie_read_mmio(base + reg, size, &val); + if (ret) { + dev_err(pci->dev, "Read DBI address failed\n"); + return ~0U; + } + + return val; +} + +static void bt1_pcie_write_dbi(struct dw_pcie *pci, void __iomem *base, u32 reg, + size_t size, u32 val) +{ + int ret; + + ret = bt1_pcie_write_mmio(base + reg, size, val); + if (ret) + dev_err(pci->dev, "Write DBI address failed\n"); +} + +static void bt1_pcie_write_dbi2(struct dw_pcie *pci, void __iomem *base, u32 reg, + size_t size, u32 val) +{ + struct bt1_pcie *btpci = to_bt1_pcie(pci); + int ret; + + regmap_update_bits(btpci->sys_regs, BT1_CCU_PCIE_GENC, + BT1_CCU_PCIE_DBI2_MODE, BT1_CCU_PCIE_DBI2_MODE); + + ret = bt1_pcie_write_mmio(base + reg, size, val); + if (ret) + dev_err(pci->dev, "Write DBI2 address failed\n"); + + regmap_update_bits(btpci->sys_regs, BT1_CCU_PCIE_GENC, + BT1_CCU_PCIE_DBI2_MODE, 0); +} + +static int bt1_pcie_start_link(struct dw_pcie *pci) +{ + struct bt1_pcie *btpci = to_bt1_pcie(pci); + u32 val; + int ret; + + /* + * Enable LTSSM and make sure it was able to establish both PHY and + * data links. This procedure shall work fine to reach 2.5 GT/s speed. + */ + regmap_update_bits(btpci->sys_regs, BT1_CCU_PCIE_GENC, + BT1_CCU_PCIE_LTSSM_EN, BT1_CCU_PCIE_LTSSM_EN); + + ret = regmap_read_poll_timeout(btpci->sys_regs, BT1_CCU_PCIE_PMSC, val, + (val & BT1_CCU_PCIE_SMLH_LINKUP), + BT1_PCIE_LNK_DELAY_US, BT1_PCIE_LNK_TIMEOUT_US); + if (ret) { + dev_err(pci->dev, "LTSSM failed to set PHY link up\n"); + return ret; + } + + ret = regmap_read_poll_timeout(btpci->sys_regs, BT1_CCU_PCIE_PMSC, val, + (val & BT1_CCU_PCIE_RDLH_LINKUP), + BT1_PCIE_LNK_DELAY_US, BT1_PCIE_LNK_TIMEOUT_US); + if (ret) { + dev_err(pci->dev, "LTSSM failed to set data link up\n"); + return ret; + } + + /* + * Activate direct speed change after the link is established in an + * attempt to reach a higher bus performance (up to Gen.3 - 8.0 GT/s). + * This is required at least to get 8.0 GT/s speed. + */ + val = dw_pcie_readl_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL); + val |= PORT_LOGIC_SPEED_CHANGE; + dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); + + ret = regmap_read_poll_timeout(btpci->sys_regs, BT1_CCU_PCIE_PMSC, val, + BT1_CCU_PCIE_LTSSM_LINKUP(val), + BT1_PCIE_LNK_DELAY_US, BT1_PCIE_LNK_TIMEOUT_US); + if (ret) + dev_err(pci->dev, "LTSSM failed to get into L0 state\n"); + + return ret; +} + +static void bt1_pcie_stop_link(struct dw_pcie *pci) +{ + struct bt1_pcie *btpci = to_bt1_pcie(pci); + + regmap_update_bits(btpci->sys_regs, BT1_CCU_PCIE_GENC, + BT1_CCU_PCIE_LTSSM_EN, 0); +} + +static const struct dw_pcie_ops bt1_pcie_ops = { + .read_dbi = bt1_pcie_read_dbi, + .write_dbi = bt1_pcie_write_dbi, + .write_dbi2 = bt1_pcie_write_dbi2, + .start_link = bt1_pcie_start_link, + .stop_link = bt1_pcie_stop_link, +}; + +static struct pci_ops bt1_pci_ops = { + .map_bus = dw_pcie_own_conf_map_bus, + .read = pci_generic_config_read32, + .write = pci_generic_config_write32, +}; + +static int bt1_pcie_get_resources(struct bt1_pcie *btpci) +{ + struct device *dev = btpci->dw.dev; + int i; + + /* DBI access is supposed to be performed by the dword-aligned IOs */ + btpci->dw.pp.bridge->ops = &bt1_pci_ops; + + /* These CSRs are in MMIO so we won't check the regmap-methods status */ + btpci->sys_regs = + syscon_regmap_lookup_by_phandle(dev->of_node, "baikal,bt1-syscon"); + if (IS_ERR(btpci->sys_regs)) + return dev_err_probe(dev, PTR_ERR(btpci->sys_regs), + "Failed to get syscon\n"); + + /* Make sure all the required resources have been specified */ + for (i = 0; i < BT1_PCIE_NUM_APP_CLKS; i++) { + if (!btpci->dw.app_clks[bt1_pcie_app_clks[i]].clk) { + dev_err(dev, "App clocks set is incomplete\n"); + return -ENOENT; + } + } + + for (i = 0; i < BT1_PCIE_NUM_CORE_CLKS; i++) { + if (!btpci->dw.core_clks[bt1_pcie_core_clks[i]].clk) { + dev_err(dev, "Core clocks set is incomplete\n"); + return -ENOENT; + } + } + + for (i = 0; i < BT1_PCIE_NUM_APP_RSTS; i++) { + if (!btpci->dw.app_rsts[bt1_pcie_app_rsts[i]].rstc) { + dev_err(dev, "App resets set is incomplete\n"); + return -ENOENT; + } + } + + for (i = 0; i < BT1_PCIE_NUM_CORE_RSTS; i++) { + if (!btpci->dw.core_rsts[bt1_pcie_core_rsts[i]].rstc) { + dev_err(dev, "Core resets set is incomplete\n"); + return -ENOENT; + } + } + + return 0; +} + +static void bt1_pcie_full_stop_bus(struct bt1_pcie *btpci, bool init) +{ + struct device *dev = btpci->dw.dev; + struct dw_pcie *pci = &btpci->dw; + int ret; + + /* Disable LTSSM for sure */ + regmap_update_bits(btpci->sys_regs, BT1_CCU_PCIE_GENC, + BT1_CCU_PCIE_LTSSM_EN, 0); + + /* + * Application reset controls are trigger-based so assert the core + * resets only. + */ + ret = reset_control_bulk_assert(DW_PCIE_NUM_CORE_RSTS, pci->core_rsts); + if (ret) + dev_err(dev, "Failed to assert core resets\n"); + + /* + * Clocks are disabled by default at least in accordance with the clk + * enable counter value on init stage. + */ + if (!init) { + clk_bulk_disable_unprepare(DW_PCIE_NUM_CORE_CLKS, pci->core_clks); + + clk_bulk_disable_unprepare(DW_PCIE_NUM_APP_CLKS, pci->app_clks); + } + + /* The peripheral devices are unavailable anyway so reset them too */ + gpiod_set_value_cansleep(pci->pe_rst, 1); + + /* Make sure all the resets are settled */ + msleep(BT1_PCIE_RST_DELAY_MS); +} + +/* + * Implements the cold reset procedure in accordance with the reference manual + * and available PM signals. + */ +static int bt1_pcie_cold_start_bus(struct bt1_pcie *btpci) +{ + struct device *dev = btpci->dw.dev; + struct dw_pcie *pci = &btpci->dw; + u32 val; + int ret; + + /* First get out of the Power/Hot reset state */ + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_PWR_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert PHY reset\n"); + return ret; + } + + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_HOT_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert hot reset\n"); + goto err_assert_pwr_rst; + } + + /* Wait for the PM-core to stop requesting the PHY reset */ + ret = regmap_read_poll_timeout(btpci->sys_regs, BT1_CCU_PCIE_RSTC, val, + !(val & BT1_CCU_PCIE_REQ_PHY_RST), + BT1_PCIE_REQ_DELAY_US, BT1_PCIE_REQ_TIMEOUT_US); + if (ret) { + dev_err(dev, "Timed out waiting for PM to stop PHY resetting\n"); + goto err_assert_hot_rst; + } + + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_PHY_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert PHY reset\n"); + goto err_assert_hot_rst; + } + + /* Clocks can be now enabled, but the ref one is crucial at this stage */ + ret = clk_bulk_prepare_enable(DW_PCIE_NUM_APP_CLKS, pci->app_clks); + if (ret) { + dev_err(dev, "Failed to enable app clocks\n"); + goto err_assert_phy_rst; + } + + ret = clk_bulk_prepare_enable(DW_PCIE_NUM_CORE_CLKS, pci->core_clks); + if (ret) { + dev_err(dev, "Failed to enable ref clocks\n"); + goto err_disable_app_clk; + } + + /* Wait for the PM to stop requesting the controller core reset */ + ret = regmap_read_poll_timeout(btpci->sys_regs, BT1_CCU_PCIE_RSTC, val, + !(val & BT1_CCU_PCIE_REQ_CORE_RST), + BT1_PCIE_REQ_DELAY_US, BT1_PCIE_REQ_TIMEOUT_US); + if (ret) { + dev_err(dev, "Timed out waiting for PM to stop core resetting\n"); + goto err_disable_core_clk; + } + + /* PCS-PIPE interface and controller core can be now activated */ + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_PIPE_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert PIPE reset\n"); + goto err_disable_core_clk; + } + + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_CORE_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert core reset\n"); + goto err_assert_pipe_rst; + } + + /* It's recommended to reset the core and application logic together */ + ret = reset_control_bulk_reset(DW_PCIE_NUM_APP_RSTS, pci->app_rsts); + if (ret) { + dev_err(dev, "Failed to reset app domain\n"); + goto err_assert_core_rst; + } + + /* Sticky/Non-sticky CSR flags can be now unreset too */ + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_STICKY_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert sticky reset\n"); + goto err_assert_core_rst; + } + + ret = reset_control_deassert(pci->core_rsts[DW_PCIE_NON_STICKY_RST].rstc); + if (ret) { + dev_err(dev, "Failed to deassert non-sticky reset\n"); + goto err_assert_sticky_rst; + } + + /* Activate the PCIe bus peripheral devices */ + gpiod_set_value_cansleep(pci->pe_rst, 0); + + /* Make sure the state is settled (LTSSM is still disabled though) */ + usleep_range(BT1_PCIE_RUN_DELAY_US, BT1_PCIE_RUN_DELAY_US + 100); + + return 0; + +err_assert_sticky_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_STICKY_RST].rstc); + +err_assert_core_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_CORE_RST].rstc); + +err_assert_pipe_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_PIPE_RST].rstc); + +err_disable_core_clk: + clk_bulk_disable_unprepare(DW_PCIE_NUM_CORE_CLKS, pci->core_clks); + +err_disable_app_clk: + clk_bulk_disable_unprepare(DW_PCIE_NUM_APP_CLKS, pci->app_clks); + +err_assert_phy_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_PHY_RST].rstc); + +err_assert_hot_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_HOT_RST].rstc); + +err_assert_pwr_rst: + reset_control_assert(pci->core_rsts[DW_PCIE_PWR_RST].rstc); + + return ret; +} + +static int bt1_pcie_host_init(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct bt1_pcie *btpci = to_bt1_pcie(pci); + int ret; + + ret = bt1_pcie_get_resources(btpci); + if (ret) + return ret; + + bt1_pcie_full_stop_bus(btpci, true); + + return bt1_pcie_cold_start_bus(btpci); +} + +static void bt1_pcie_host_deinit(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct bt1_pcie *btpci = to_bt1_pcie(pci); + + bt1_pcie_full_stop_bus(btpci, false); +} + +static const struct dw_pcie_host_ops bt1_pcie_host_ops = { + .host_init = bt1_pcie_host_init, + .host_deinit = bt1_pcie_host_deinit, +}; + +static struct bt1_pcie *bt1_pcie_create_data(struct platform_device *pdev) +{ + struct bt1_pcie *btpci; + + btpci = devm_kzalloc(&pdev->dev, sizeof(*btpci), GFP_KERNEL); + if (!btpci) + return ERR_PTR(-ENOMEM); + + btpci->pdev = pdev; + + platform_set_drvdata(pdev, btpci); + + return btpci; +} + +static int bt1_pcie_add_port(struct bt1_pcie *btpci) +{ + struct device *dev = &btpci->pdev->dev; + int ret; + + btpci->dw.version = DW_PCIE_VER_460A; + btpci->dw.dev = dev; + btpci->dw.ops = &bt1_pcie_ops; + + btpci->dw.pp.num_vectors = MAX_MSI_IRQS; + btpci->dw.pp.ops = &bt1_pcie_host_ops; + + dw_pcie_cap_set(&btpci->dw, REQ_RES); + + ret = dw_pcie_host_init(&btpci->dw.pp); + + return dev_err_probe(dev, ret, "Failed to initialize DWC PCIe host\n"); +} + +static void bt1_pcie_del_port(struct bt1_pcie *btpci) +{ + dw_pcie_host_deinit(&btpci->dw.pp); +} + +static int bt1_pcie_probe(struct platform_device *pdev) +{ + struct bt1_pcie *btpci; + + btpci = bt1_pcie_create_data(pdev); + if (IS_ERR(btpci)) + return PTR_ERR(btpci); + + return bt1_pcie_add_port(btpci); +} + +static int bt1_pcie_remove(struct platform_device *pdev) +{ + struct bt1_pcie *btpci = platform_get_drvdata(pdev); + + bt1_pcie_del_port(btpci); + + return 0; +} + +static const struct of_device_id bt1_pcie_of_match[] = { + { .compatible = "baikal,bt1-pcie" }, + {}, +}; +MODULE_DEVICE_TABLE(of, bt1_pcie_of_match); + +static struct platform_driver bt1_pcie_driver = { + .probe = bt1_pcie_probe, + .remove = bt1_pcie_remove, + .driver = { + .name = "bt1-pcie", + .of_match_table = bt1_pcie_of_match, + }, +}; +module_platform_driver(bt1_pcie_driver); + +MODULE_AUTHOR("Serge Semin "); +MODULE_DESCRIPTION("Baikal-T1 PCIe driver"); +MODULE_LICENSE("GPL"); -- GitLab From 9298804840457c29c7e115f3a87bec406c262c81 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:08 -0400 Subject: [PATCH 0873/1423] PCI: endpoint: pci-epf-vntb: Clean up kernel_doc warning Cleanup warning found by scripts/kernel-doc. Consolidate terms: - host, host1 to HOST - vhost, vHost, Vhost, VHOST2 to VHOST Link: https://lore.kernel.org/r/20221102141014.1025893-2-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Acked-by: Manivannan Sadhasivam --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 83 ++++++++++++------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index 0ea85e1d292ec..c0115bcb3b5ec 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -11,7 +11,7 @@ * Author: Kishon Vijay Abraham I */ -/** +/* * +------------+ +---------------------------------------+ * | | | | * +------------+ | +--------------+ @@ -156,12 +156,14 @@ static struct pci_epf_header epf_ntb_header = { }; /** - * epf_ntb_link_up() - Raise link_up interrupt to Virtual Host + * epf_ntb_link_up() - Raise link_up interrupt to Virtual Host (VHOST) * @ntb: NTB device that facilitates communication between HOST and VHOST * @link_up: true or false indicating Link is UP or Down * * Once NTB function in HOST invoke ntb_link_enable(), - * this NTB function driver will trigger a link event to vhost. + * this NTB function driver will trigger a link event to VHOST. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) { @@ -175,9 +177,9 @@ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) } /** - * epf_ntb_configure_mw() - Configure the Outbound Address Space for vhost - * to access the memory window of host - * @ntb: NTB device that facilitates communication between host and vhost + * epf_ntb_configure_mw() - Configure the Outbound Address Space for VHOST + * to access the memory window of HOST + * @ntb: NTB device that facilitates communication between HOST and VHOST * @mw: Index of the memory window (either 0, 1, 2 or 3) * * EP Outbound Window @@ -194,7 +196,9 @@ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) * | | | | * | | | | * +--------+ +-----------+ - * VHost PCI EP + * VHOST PCI EP + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_configure_mw(struct epf_ntb *ntb, u32 mw) { @@ -219,7 +223,7 @@ static int epf_ntb_configure_mw(struct epf_ntb *ntb, u32 mw) /** * epf_ntb_teardown_mw() - Teardown the configured OB ATU - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST * @mw: Index of the memory window (either 0, 1, 2 or 3) * * Teardown the configured OB ATU configured in epf_ntb_configure_mw() using @@ -234,12 +238,12 @@ static void epf_ntb_teardown_mw(struct epf_ntb *ntb, u32 mw) } /** - * epf_ntb_cmd_handler() - Handle commands provided by the NTB Host + * epf_ntb_cmd_handler() - Handle commands provided by the NTB HOST * @work: work_struct for the epf_ntb_epc * * Workqueue function that gets invoked for the two epf_ntb_epc * periodically (once every 5ms) to see if it has received any commands - * from NTB host. The host can send commands to configure doorbell or + * from NTB HOST. The HOST can send commands to configure doorbell or * configure memory window or to update link status. */ static void epf_ntb_cmd_handler(struct work_struct *work) @@ -321,8 +325,8 @@ reset_handler: /** * epf_ntb_config_sspad_bar_clear() - Clear Config + Self scratchpad BAR - * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound - * address. + * @ntb: EPC associated with one of the HOST which holds peer's outbound + * address. * * Clear BAR0 of EP CONTROLLER 1 which contains the HOST1's config and * self scratchpad region (removes inbound ATU configuration). While BAR0 is @@ -331,8 +335,10 @@ reset_handler: * used for self scratchpad from epf_ntb_bar[BAR_CONFIG]. * * Please note the self scratchpad region and config region is combined to - * a single region and mapped using the same BAR. Also note HOST2's peer - * scratchpad is HOST1's self scratchpad. + * a single region and mapped using the same BAR. Also note VHOST's peer + * scratchpad is HOST's self scratchpad. + * + * Returns: void */ static void epf_ntb_config_sspad_bar_clear(struct epf_ntb *ntb) { @@ -347,13 +353,15 @@ static void epf_ntb_config_sspad_bar_clear(struct epf_ntb *ntb) /** * epf_ntb_config_sspad_bar_set() - Set Config + Self scratchpad BAR - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST * - * Map BAR0 of EP CONTROLLER 1 which contains the HOST1's config and + * Map BAR0 of EP CONTROLLER which contains the VHOST's config and * self scratchpad region. * * Please note the self scratchpad region and config region is combined to * a single region and mapped using the same BAR. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_config_sspad_bar_set(struct epf_ntb *ntb) { @@ -380,7 +388,7 @@ static int epf_ntb_config_sspad_bar_set(struct epf_ntb *ntb) /** * epf_ntb_config_spad_bar_free() - Free the physical memory associated with * config + scratchpad region - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST */ static void epf_ntb_config_spad_bar_free(struct epf_ntb *ntb) { @@ -393,11 +401,13 @@ static void epf_ntb_config_spad_bar_free(struct epf_ntb *ntb) /** * epf_ntb_config_spad_bar_alloc() - Allocate memory for config + scratchpad * region - * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @ntb: NTB device that facilitates communication between HOST and VHOST * * Allocate the Local Memory mentioned in the above diagram. The size of * CONFIG REGION is sizeof(struct epf_ntb_ctrl) and size of SCRATCHPAD REGION * is obtained from "spad-count" configfs entry. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_config_spad_bar_alloc(struct epf_ntb *ntb) { @@ -465,11 +475,13 @@ static int epf_ntb_config_spad_bar_alloc(struct epf_ntb *ntb) } /** - * epf_ntb_configure_interrupt() - Configure MSI/MSI-X capaiblity - * @ntb: NTB device that facilitates communication between HOST and vHOST + * epf_ntb_configure_interrupt() - Configure MSI/MSI-X capability + * @ntb: NTB device that facilitates communication between HOST and VHOST * * Configure MSI/MSI-X capability for each interface with number of * interrupts equal to "db_count" configfs entry. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb) { @@ -511,7 +523,9 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb) /** * epf_ntb_db_bar_init() - Configure Doorbell window BARs - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_db_bar_init(struct epf_ntb *ntb) { @@ -566,7 +580,7 @@ static void epf_ntb_mw_bar_clear(struct epf_ntb *ntb, int num_mws); /** * epf_ntb_db_bar_clear() - Clear doorbell BAR and free memory * allocated in peer's outbound address space - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST */ static void epf_ntb_db_bar_clear(struct epf_ntb *ntb) { @@ -582,8 +596,9 @@ static void epf_ntb_db_bar_clear(struct epf_ntb *ntb) /** * epf_ntb_mw_bar_init() - Configure Memory window BARs - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_mw_bar_init(struct epf_ntb *ntb) { @@ -639,7 +654,7 @@ err_alloc_mem: /** * epf_ntb_mw_bar_clear() - Clear Memory window BARs - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST */ static void epf_ntb_mw_bar_clear(struct epf_ntb *ntb, int num_mws) { @@ -662,7 +677,7 @@ static void epf_ntb_mw_bar_clear(struct epf_ntb *ntb, int num_mws) /** * epf_ntb_epc_destroy() - Cleanup NTB EPC interface - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST * * Wrapper for epf_ntb_epc_destroy_interface() to cleanup all the NTB interfaces */ @@ -675,7 +690,9 @@ static void epf_ntb_epc_destroy(struct epf_ntb *ntb) /** * epf_ntb_init_epc_bar() - Identify BARs to be used for each of the NTB * constructs (scratchpad region, doorbell, memorywindow) - * @ntb: NTB device that facilitates communication between HOST and vHOST + * @ntb: NTB device that facilitates communication between HOST and VHOST + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_init_epc_bar(struct epf_ntb *ntb) { @@ -716,11 +733,13 @@ static int epf_ntb_init_epc_bar(struct epf_ntb *ntb) /** * epf_ntb_epc_init() - Initialize NTB interface - * @ntb: NTB device that facilitates communication between HOST and vHOST2 + * @ntb: NTB device that facilitates communication between HOST and VHOST * * Wrapper to initialize a particular EPC interface and start the workqueue - * to check for commands from host. This function will write to the + * to check for commands from HOST. This function will write to the * EP controller HW for configuring it. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_epc_init(struct epf_ntb *ntb) { @@ -787,7 +806,7 @@ err_config_interrupt: /** * epf_ntb_epc_cleanup() - Cleanup all NTB interfaces - * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @ntb: NTB device that facilitates communication between HOST and VHOST * * Wrapper to cleanup all NTB interfaces. */ @@ -951,6 +970,8 @@ static const struct config_item_type ntb_group_type = { * * Add configfs directory specific to NTB. This directory will hold * NTB specific properties like db_count, spad_count, num_mws etc., + * + * Returns: Pointer to config_group */ static struct config_group *epf_ntb_add_cfs(struct pci_epf *epf, struct config_group *group) @@ -1292,6 +1313,8 @@ static struct pci_driver vntb_pci_driver = { * Invoked when a primary interface or secondary interface is bound to EPC * device. This function will succeed only when EPC is bound to both the * interfaces. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_bind(struct pci_epf *epf) { @@ -1377,6 +1400,8 @@ static struct pci_epf_ops epf_ntb_ops = { * * Probe NTB function driver when endpoint function bus detects a NTB * endpoint function. + * + * Returns: Zero for success, or an error code in case of failure */ static int epf_ntb_probe(struct pci_epf *epf) { -- GitLab From 1d118fed348f65bcc08e9bfb947085c276d05b52 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:09 -0400 Subject: [PATCH 0874/1423] PCI: endpoint: pci-epf-vntb: Fix struct epf_ntb_ctrl indentation Align the indentation of struct epf_ntb_ctrl with other structs in the driver. Link: https://lore.kernel.org/r/20221102141014.1025893-3-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index c0115bcb3b5ec..1863006cc36c0 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -99,20 +99,20 @@ enum epf_ntb_bar { * NTB Driver NTB Driver */ struct epf_ntb_ctrl { - u32 command; - u32 argument; - u16 command_status; - u16 link_status; - u32 topology; - u64 addr; - u64 size; - u32 num_mws; - u32 reserved; - u32 spad_offset; - u32 spad_count; - u32 db_entry_size; - u32 db_data[MAX_DB_COUNT]; - u32 db_offset[MAX_DB_COUNT]; + u32 command; + u32 argument; + u16 command_status; + u16 link_status; + u32 topology; + u64 addr; + u64 size; + u32 num_mws; + u32 reserved; + u32 spad_offset; + u32 spad_count; + u32 db_entry_size; + u32 db_data[MAX_DB_COUNT]; + u32 db_offset[MAX_DB_COUNT]; } __packed; struct epf_ntb { -- GitLab From 0c031262d2ddfb938f9668d620d7ed674771646c Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:10 -0400 Subject: [PATCH 0875/1423] PCI: endpoint: pci-epf-vntb: Fix call pci_epc_mem_free_addr() in error path Replace pci_epc_mem_free_addr() with pci_epf_free_space() in the error handle path to match pci_epf_alloc_space(). Link: https://lore.kernel.org/r/20221102141014.1025893-4-Frank.Li@nxp.com Fixes: e35f56bb0330 ("PCI: endpoint: Support NTB transfer between RC and EP") Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index 1863006cc36c0..191924a834548 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -571,7 +571,7 @@ static int epf_ntb_db_bar_init(struct epf_ntb *ntb) return ret; err_alloc_peer_mem: - pci_epc_mem_free_addr(ntb->epf->epc, epf_bar->phys_addr, mw_addr, epf_bar->size); + pci_epf_free_space(ntb->epf, mw_addr, barno, 0); return -1; } -- GitLab From 03d426ae5426caf46cb96534ca77fa50a018dd3a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:11 -0400 Subject: [PATCH 0876/1423] PCI: endpoint: pci-epf-vntb: Remove unused epf_db_phy struct member epf_db_phy member in struct epf_ntb is not used, remove it. Link: https://lore.kernel.org/r/20221102141014.1025893-5-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Acked-by: Manivannan Sadhasivam --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index 191924a834548..ee66101cb5c41 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -136,7 +136,6 @@ struct epf_ntb { struct epf_ntb_ctrl *reg; - phys_addr_t epf_db_phy; void __iomem *epf_db; phys_addr_t vpci_mw_phy[MAX_MW]; -- GitLab From 2b35c886556a24f1531edf38a4dab53bbbea4db4 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:12 -0400 Subject: [PATCH 0877/1423] PCI: endpoint: pci-epf-vntb: Replace hardcoded 4 with sizeof(u32) NTB spad entry item size is sizeof(u32), replace hardcoded 4 with it. Link: https://lore.kernel.org/r/20221102141014.1025893-6-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Acked-by: Manivannan Sadhasivam --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index ee66101cb5c41..54616281da9e1 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -257,12 +257,12 @@ static void epf_ntb_cmd_handler(struct work_struct *work) ntb = container_of(work, struct epf_ntb, cmd_handler.work); for (i = 1; i < ntb->db_count; i++) { - if (readl(ntb->epf_db + i * 4)) { - if (readl(ntb->epf_db + i * 4)) + if (readl(ntb->epf_db + i * sizeof(u32))) { + if (readl(ntb->epf_db + i * sizeof(u32))) ntb->db |= 1 << (i - 1); ntb_db_event(&ntb->ntb, i); - writel(0, ntb->epf_db + i * 4); + writel(0, ntb->epf_db + i * sizeof(u32)); } } @@ -433,7 +433,7 @@ static int epf_ntb_config_spad_bar_alloc(struct epf_ntb *ntb) spad_count = ntb->spad_count; ctrl_size = sizeof(struct epf_ntb_ctrl); - spad_size = 2 * spad_count * 4; + spad_size = 2 * spad_count * sizeof(u32); if (!align) { ctrl_size = roundup_pow_of_two(ctrl_size); @@ -463,7 +463,7 @@ static int epf_ntb_config_spad_bar_alloc(struct epf_ntb *ntb) ctrl->num_mws = ntb->num_mws; ntb->spad_size = spad_size; - ctrl->db_entry_size = 4; + ctrl->db_entry_size = sizeof(u32); for (i = 0; i < ntb->db_count; i++) { ntb->reg->db_data[i] = 1 + i; @@ -535,7 +535,7 @@ static int epf_ntb_db_bar_init(struct epf_ntb *ntb) struct pci_epf_bar *epf_bar; void __iomem *mw_addr; enum pci_barno barno; - size_t size = 4 * ntb->db_count; + size_t size = sizeof(u32) * ntb->db_count; epc_features = pci_epc_get_features(ntb->epf->epc, ntb->epf->func_no, @@ -1121,11 +1121,11 @@ static int vntb_epf_link_enable(struct ntb_dev *ntb, static u32 vntb_epf_spad_read(struct ntb_dev *ndev, int idx) { struct epf_ntb *ntb = ntb_ndev(ndev); - int off = ntb->reg->spad_offset, ct = ntb->reg->spad_count * 4; + int off = ntb->reg->spad_offset, ct = ntb->reg->spad_count * sizeof(u32); u32 val; void __iomem *base = ntb->reg; - val = readl(base + off + ct + idx * 4); + val = readl(base + off + ct + idx * sizeof(u32)); return val; } @@ -1133,10 +1133,10 @@ static int vntb_epf_spad_write(struct ntb_dev *ndev, int idx, u32 val) { struct epf_ntb *ntb = ntb_ndev(ndev); struct epf_ntb_ctrl *ctrl = ntb->reg; - int off = ctrl->spad_offset, ct = ctrl->spad_count * 4; + int off = ctrl->spad_offset, ct = ctrl->spad_count * sizeof(u32); void __iomem *base = ntb->reg; - writel(val, base + off + ct + idx * 4); + writel(val, base + off + ct + idx * sizeof(u32)); return 0; } @@ -1148,7 +1148,7 @@ static u32 vntb_epf_peer_spad_read(struct ntb_dev *ndev, int pidx, int idx) void __iomem *base = ntb->reg; u32 val; - val = readl(base + off + idx * 4); + val = readl(base + off + idx * sizeof(u32)); return val; } @@ -1159,7 +1159,7 @@ static int vntb_epf_peer_spad_write(struct ntb_dev *ndev, int pidx, int idx, u32 int off = ctrl->spad_offset; void __iomem *base = ntb->reg; - writel(val, base + off + idx * 4); + writel(val, base + off + idx * sizeof(u32)); return 0; } -- GitLab From 01dcec6d57ce62d535b2016fc4a617627fff506d Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:13 -0400 Subject: [PATCH 0878/1423] PCI: endpoint: pci-epf-vntb: Fix sparse build warning for epf_db Use epf_db[i] dereference instead of readl() because epf_db is in memory allocated by dma_alloc_coherent(), not I/O. Remove useless/duplicated readl() in the process. Link: https://lore.kernel.org/r/20221102141014.1025893-7-Frank.Li@nxp.com Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index 54616281da9e1..f896846ed970a 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -136,7 +136,7 @@ struct epf_ntb { struct epf_ntb_ctrl *reg; - void __iomem *epf_db; + u32 *epf_db; phys_addr_t vpci_mw_phy[MAX_MW]; void __iomem *vpci_mw_addr[MAX_MW]; @@ -257,12 +257,10 @@ static void epf_ntb_cmd_handler(struct work_struct *work) ntb = container_of(work, struct epf_ntb, cmd_handler.work); for (i = 1; i < ntb->db_count; i++) { - if (readl(ntb->epf_db + i * sizeof(u32))) { - if (readl(ntb->epf_db + i * sizeof(u32))) - ntb->db |= 1 << (i - 1); - + if (ntb->epf_db[i]) { + ntb->db |= 1 << (i - 1); ntb_db_event(&ntb->ntb, i); - writel(0, ntb->epf_db + i * sizeof(u32)); + ntb->epf_db[i] = 0; } } -- GitLab From 5f697b25009ccfebede5e42c6693c4b18de11b37 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 2 Nov 2022 10:10:14 -0400 Subject: [PATCH 0879/1423] PCI: endpoint: pci-epf-vntb: Fix sparse ntb->reg build warning pci-epf-vntb.c:1128:33: sparse: expected void [noderef] __iomem *base pci-epf-vntb.c:1128:33: sparse: got struct epf_ntb_ctrl *reg Add __iomem type cast in vntb_epf_peer_spad_read() and vntb_epf_peer_spad_write(). Link: https://lore.kernel.org/r/20221102141014.1025893-8-Frank.Li@nxp.com Reported-by: kernel test robot Signed-off-by: Frank Li Signed-off-by: Lorenzo Pieralisi Acked-by: Manivannan Sadhasivam --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index f896846ed970a..04698e7995a54 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -1121,7 +1121,7 @@ static u32 vntb_epf_spad_read(struct ntb_dev *ndev, int idx) struct epf_ntb *ntb = ntb_ndev(ndev); int off = ntb->reg->spad_offset, ct = ntb->reg->spad_count * sizeof(u32); u32 val; - void __iomem *base = ntb->reg; + void __iomem *base = (void __iomem *)ntb->reg; val = readl(base + off + ct + idx * sizeof(u32)); return val; @@ -1132,7 +1132,7 @@ static int vntb_epf_spad_write(struct ntb_dev *ndev, int idx, u32 val) struct epf_ntb *ntb = ntb_ndev(ndev); struct epf_ntb_ctrl *ctrl = ntb->reg; int off = ctrl->spad_offset, ct = ctrl->spad_count * sizeof(u32); - void __iomem *base = ntb->reg; + void __iomem *base = (void __iomem *)ntb->reg; writel(val, base + off + ct + idx * sizeof(u32)); return 0; @@ -1143,7 +1143,7 @@ static u32 vntb_epf_peer_spad_read(struct ntb_dev *ndev, int pidx, int idx) struct epf_ntb *ntb = ntb_ndev(ndev); struct epf_ntb_ctrl *ctrl = ntb->reg; int off = ctrl->spad_offset; - void __iomem *base = ntb->reg; + void __iomem *base = (void __iomem *)ntb->reg; u32 val; val = readl(base + off + idx * sizeof(u32)); @@ -1155,7 +1155,7 @@ static int vntb_epf_peer_spad_write(struct ntb_dev *ndev, int pidx, int idx, u32 struct epf_ntb *ntb = ntb_ndev(ndev); struct epf_ntb_ctrl *ctrl = ntb->reg; int off = ctrl->spad_offset; - void __iomem *base = ntb->reg; + void __iomem *base = (void __iomem *)ntb->reg; writel(val, base + off + idx * sizeof(u32)); return 0; -- GitLab From 8ac813f7e663bcf03e09291517359111d0cf9785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:35:45 +0100 Subject: [PATCH 0880/1423] gpio: max732x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-max732x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-max732x.c b/drivers/gpio/gpio-max732x.c index da69721170308..68e982cdee734 100644 --- a/drivers/gpio/gpio-max732x.c +++ b/drivers/gpio/gpio-max732x.c @@ -608,9 +608,9 @@ static struct max732x_platform_data *of_gpio_max732x(struct device *dev) return pdata; } -static int max732x_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int max732x_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct max732x_platform_data *pdata; struct device_node *node; struct max732x_chip *chip; @@ -707,7 +707,7 @@ static struct i2c_driver max732x_driver = { .name = "max732x", .of_match_table = of_match_ptr(max732x_of_table), }, - .probe = max732x_probe, + .probe_new = max732x_probe, .id_table = max732x_id, }; -- GitLab From 1287341c1980e0cf9eb19bdd370405d755392f2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:35:46 +0100 Subject: [PATCH 0881/1423] gpio: pca953x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index ebe1943b85dd9..342800527c146 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -1049,9 +1049,9 @@ out: return ret; } -static int pca953x_probe(struct i2c_client *client, - const struct i2c_device_id *i2c_id) +static int pca953x_probe(struct i2c_client *client) { + const struct i2c_device_id *i2c_id = i2c_client_get_device_id(client); struct pca953x_platform_data *pdata; struct pca953x_chip *chip; int irq_base = 0; @@ -1375,7 +1375,7 @@ static struct i2c_driver pca953x_driver = { .of_match_table = pca953x_dt_ids, .acpi_match_table = pca953x_acpi_ids, }, - .probe = pca953x_probe, + .probe_new = pca953x_probe, .remove = pca953x_remove, .id_table = pca953x_id, }; -- GitLab From 7963ba02b2d1de681ba1ee33060db42eb4cf4c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:35:47 +0100 Subject: [PATCH 0882/1423] gpio: pcf857x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pcf857x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-pcf857x.c b/drivers/gpio/gpio-pcf857x.c index e98ea47d72379..cec2f2c78255f 100644 --- a/drivers/gpio/gpio-pcf857x.c +++ b/drivers/gpio/gpio-pcf857x.c @@ -247,9 +247,9 @@ static const struct irq_chip pcf857x_irq_chip = { /*-------------------------------------------------------------------------*/ -static int pcf857x_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int pcf857x_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct pcf857x_platform_data *pdata = dev_get_platdata(&client->dev); struct device_node *np = client->dev.of_node; struct pcf857x *gpio; @@ -422,7 +422,7 @@ static struct i2c_driver pcf857x_driver = { .name = "pcf857x", .of_match_table = of_match_ptr(pcf857x_of_table), }, - .probe = pcf857x_probe, + .probe_new = pcf857x_probe, .remove = pcf857x_remove, .shutdown = pcf857x_shutdown, .id_table = pcf857x_id, -- GitLab From 5736b1b70170e15d66ec02e500db917ef42ade83 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 3 Sep 2022 00:37:06 -0700 Subject: [PATCH 0883/1423] x86/paravirt: Remove clobber bitmask from .parainstructions The u16 "clobber" value is not used in .parainstructions since commit 27876f3882fd ("x86/paravirt: Remove clobbers from struct paravirt_patch_site") Remove the u16 from the section macro, the argument from all macros, and all now-unused CLBR_* macros. Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220903073706.3193746-1-keescook@chromium.org --- arch/x86/include/asm/paravirt_types.h | 61 ++++++--------------------- 1 file changed, 12 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 27c692791b7ea..8c1da419260f9 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -20,37 +20,6 @@ enum paravirt_lazy_mode { #ifdef CONFIG_PARAVIRT -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) - -#endif /* X86_64 */ - #ifndef __ASSEMBLY__ #include @@ -297,27 +266,23 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ [paravirt_opptr] "m" (pv_ops.op) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - /* * Generate some code, and mark it as patchable by the * apply_paravirt() alternate instruction patcher. */ -#define _paravirt_alt(insn_string, type, clobber) \ +#define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ _ASM_PTR " 771b\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - " .short " clobber "\n" \ _ASM_ALIGN "\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ #define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + _paravirt_alt(insn_string, "%c[paravirt_typenum]") /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" @@ -469,20 +434,19 @@ int paravirt_disable_iospace(void); }) -#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ +#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(paravirt_alt(PARAVIRT_CALL) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) -#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \ extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ @@ -491,45 +455,44 @@ int paravirt_disable_iospace(void); alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) #define __PVOP_CALL(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ - ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_CALLEESAVE(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ - CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_VCALL(op, ...) \ - (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \ VEXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_VCALL(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + (void)____PVOP_ALT_CALL(, op, alt, cond, \ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_VCALLEESAVE(op, ...) \ - (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + (void)____PVOP_CALL(, op.func, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) -- GitLab From f1a033cc6b9eb6d80322008422df3c87aa5d47a0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 9 Nov 2022 14:44:18 +0100 Subject: [PATCH 0884/1423] x86/paravirt: Use common macro for creating simple asm paravirt functions There are some paravirt assembler functions which are sharing a common pattern. Introduce a macro DEFINE_PARAVIRT_ASM() for creating them. Note that this macro is including explicit alignment of the generated functions, leading to __raw_callee_save___kvm_vcpu_is_preempted(), _paravirt_nop() and paravirt_ret0() to be aligned at 4 byte boundaries now. The explicit _paravirt_nop() prototype in paravirt.c isn't needed, as it is included in paravirt_types.h already. Signed-off-by: Juergen Gross Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srivatsa S. Bhat (VMware) Link: https://lkml.kernel.org/r/20221109134418.6516-1-jgross@suse.com --- arch/x86/include/asm/paravirt.h | 12 ++++++ arch/x86/include/asm/qspinlock_paravirt.h | 47 ++++++++++------------- arch/x86/kernel/kvm.c | 19 +++------ arch/x86/kernel/paravirt.c | 23 +---------- 4 files changed, 40 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2851bc2339d58..73e9522db7c15 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -731,6 +731,18 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + extern void default_banner(void); #else /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index d861127731f4d..42b17cf10b10e 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -14,8 +14,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock -#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" -#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock @@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .spinlock.text, \"ax\";" - ".globl " PV_UNLOCK ";" - ".type " PV_UNLOCK ", @function;" - ASM_FUNC_ALIGN - PV_UNLOCK ": " - ASM_ENDBR - FRAME_BEGIN - "push %rdx;" - "mov $0x1,%eax;" - "xor %edx,%edx;" - LOCK_PREFIX "cmpxchg %dl,(%rdi);" - "cmp $0x1,%al;" - "jne .slowpath;" - "pop %rdx;" +#define PV_UNLOCK_ASM \ + FRAME_BEGIN \ + "push %rdx\n\t" \ + "mov $0x1,%eax\n\t" \ + "xor %edx,%edx\n\t" \ + LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ + "cmp $0x1,%al\n\t" \ + "jne .slowpath\n\t" \ + "pop %rdx\n\t" \ + FRAME_END \ + ASM_RET \ + ".slowpath:\n\t" \ + "push %rsi\n\t" \ + "movzbl %al,%esi\n\t" \ + "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \ + "pop %rsi\n\t" \ + "pop %rdx\n\t" \ FRAME_END - ASM_RET - ".slowpath: " - "push %rsi;" - "movzbl %al,%esi;" - "call " PV_UNLOCK_SLOWPATH ";" - "pop %rsi;" - "pop %rdx;" - FRAME_END - ASM_RET - ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" - ".popsection"); + +DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 95fb85bea1118..4d053cb2c48a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -798,20 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and * restoring to/from the stack. */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -ASM_FUNC_ALIGN -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); +#define PV_VCPU_PREEMPTED_ASM \ + "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ + "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ + "setne %al\n\t" +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e244c49b52d7e..327757afb0276 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,29 +37,10 @@ * nop stub, which must not clobber anything *including the stack* to * avoid confusing the entry prologues. */ -extern void _paravirt_nop(void); -asm (".pushsection .entry.text, \"ax\"\n" - ".global _paravirt_nop\n" - ASM_FUNC_ALIGN - "_paravirt_nop:\n\t" - ASM_ENDBR - ASM_RET - ".size _paravirt_nop, . - _paravirt_nop\n\t" - ".type _paravirt_nop, @function\n\t" - ".popsection"); +DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); /* stub always returning 0. */ -asm (".pushsection .entry.text, \"ax\"\n" - ".global paravirt_ret0\n" - ASM_FUNC_ALIGN - "paravirt_ret0:\n\t" - ASM_ENDBR - "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" - ASM_RET - ".size paravirt_ret0, . - paravirt_ret0\n\t" - ".type paravirt_ret0, @function\n\t" - ".popsection"); - +DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { -- GitLab From b4d46c57d2fb0fa2611fa2ffbaf715925989f83f Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Thu, 24 Nov 2022 19:49:33 +0800 Subject: [PATCH 0885/1423] RDMA/erdma: Fix a typo in annotation A non-ASCII character was wrongly put in a comment, use the ACSII version. Fixes: bee85e0e31ec ("RDMA/erdma: Add main include file") Link: https://lore.kernel.org/r/20221124114933.77250-1-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index bb23d897c7108..35726f25a989c 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -219,7 +219,7 @@ struct erdma_dev { DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); /* * We provide max 496 uContexts that each has one SQ normal Db, - * and one directWQE db。 + * and one directWQE db. */ DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); -- GitLab From 35765dccaf3485575a4420da529c72484c980345 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:05 +0800 Subject: [PATCH 0886/1423] RDMA/erdma: Add a workqueue for WRs reflushing ERDMA driver use a workqueue for asynchronous reflush command posting. Implement the lifecycle of this workqueue. Link: https://lore.kernel.org/r/20221116023107.82835-2-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_main.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 35726f25a989c..3d8c11aa23a26 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -190,6 +190,7 @@ struct erdma_dev { struct net_device *netdev; struct pci_dev *pdev; struct notifier_block netdev_nb; + struct workqueue_struct *reflush_wq; resource_size_t func_bar_addr; resource_size_t func_bar_len; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index e44b06fea5957..5dc31e5df5cba 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -521,13 +521,22 @@ static int erdma_ib_device_add(struct pci_dev *pdev) u64_to_ether_addr(mac, dev->attrs.peer_addr); + dev->reflush_wq = alloc_workqueue("erdma-reflush-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!dev->reflush_wq) { + ret = -ENOMEM; + goto err_alloc_workqueue; + } + ret = erdma_device_register(dev); if (ret) - goto err_out; + goto err_register; return 0; -err_out: +err_register: + destroy_workqueue(dev->reflush_wq); +err_alloc_workqueue: xa_destroy(&dev->qp_xa); xa_destroy(&dev->cq_xa); @@ -543,6 +552,7 @@ static void erdma_ib_device_remove(struct pci_dev *pdev) unregister_netdevice_notifier(&dev->netdev_nb); ib_unregister_device(&dev->ibdev); + destroy_workqueue(dev->reflush_wq); erdma_res_cb_free(dev); xa_destroy(&dev->qp_xa); xa_destroy(&dev->cq_xa); -- GitLab From 54d8fffc2a500953ba90ff9462ae06bb05ca2354 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:06 +0800 Subject: [PATCH 0887/1423] RDMA/erdma: Implement the lifecycle of reflushing work for each QP Each QP has a work for reflushing purpose. In the work, driver will report the latest pi to hardware. Link: https://lore.kernel.org/r/20221116023107.82835-3-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_hw.h | 8 ++++++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 18 ++++++++++++++++++ drivers/infiniband/hw/erdma/erdma_verbs.h | 2 ++ 3 files changed, 28 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 1b2e2b70678f5..ab371fec610c3 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -145,6 +145,7 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_MODIFY_QP = 3, CMDQ_OPCODE_CREATE_CQ = 4, CMDQ_OPCODE_DESTROY_CQ = 5, + CMDQ_OPCODE_REFLUSH = 6, CMDQ_OPCODE_REG_MR = 8, CMDQ_OPCODE_DEREG_MR = 9 }; @@ -301,6 +302,13 @@ struct erdma_cmdq_destroy_qp_req { u32 qpn; }; +struct erdma_cmdq_reflush_req { + u64 hdr; + u32 qpn; + u32 sq_pi; + u32 rq_pi; +}; + /* cap qword 0 definition */ #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index d843ce1f35f3d..5dab1e87975ba 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -379,6 +379,21 @@ int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) return 0; } +static void erdma_flush_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct erdma_qp *qp = + container_of(dwork, struct erdma_qp, reflush_dwork); + struct erdma_cmdq_reflush_req req; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_REFLUSH); + req.qpn = QP_ID(qp); + req.sq_pi = qp->kern_qp.sq_pi; + req.rq_pi = qp->kern_qp.rq_pi; + erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL); +} + static int erdma_qp_validate_cap(struct erdma_dev *dev, struct ib_qp_init_attr *attrs) { @@ -735,6 +750,7 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_send_sge = attrs->cap.max_send_sge; qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; qp->attrs.state = ERDMA_QP_STATE_IDLE; + INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); ret = create_qp_cmd(dev, qp); if (ret) @@ -1028,6 +1044,8 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); up_write(&qp->state_lock); + cancel_delayed_work_sync(&qp->reflush_dwork); + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_DESTROY_QP); req.qpn = QP_ID(qp); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index a5574f0252bb4..9f341d032069b 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -197,6 +197,8 @@ struct erdma_qp { struct erdma_cep *cep; struct rw_semaphore state_lock; + struct delayed_work reflush_dwork; + union { struct erdma_kqp kern_qp; struct erdma_uqp user_qp; -- GitLab From 0edf42cbcc8690ef349d4432fea74d7791e3c645 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:07 +0800 Subject: [PATCH 0888/1423] RDMA/erdma: Notify the latest PI to FW for reflushing when necessary Firmware is responsible for flushing WRs in HW, and it's a little difficult for firmware to get the latest PI of QPs, especially for RQs after QP state being changed to ERROR. So we introduce a new CMDQ command, by which driver can notify to latest PI to FW, and then FW can flush all posted WRs. Link: https://lore.kernel.org/r/20221116023107.82835-4-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_qp.c | 30 ++++++++++++++++------- drivers/infiniband/hw/erdma/erdma_verbs.h | 5 ++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 521e97258de77..d088d6bef431a 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -120,6 +120,7 @@ static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, enum erdma_qp_attr_mask mask) { + bool need_reflush = false; int drop_conn, ret = 0; if (!mask) @@ -135,6 +136,7 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); } else if (attrs->state == ERDMA_QP_STATE_ERROR) { qp->attrs.state = ERDMA_QP_STATE_ERROR; + need_reflush = true; if (qp->cep) { erdma_cep_put(qp->cep); qp->cep = NULL; @@ -145,17 +147,12 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, case ERDMA_QP_STATE_RTS: drop_conn = 0; - if (attrs->state == ERDMA_QP_STATE_CLOSING) { + if (attrs->state == ERDMA_QP_STATE_CLOSING || + attrs->state == ERDMA_QP_STATE_TERMINATE || + attrs->state == ERDMA_QP_STATE_ERROR) { ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); drop_conn = 1; - } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { - qp->attrs.state = ERDMA_QP_STATE_TERMINATE; - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - drop_conn = 1; - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - qp->attrs.state = ERDMA_QP_STATE_ERROR; - drop_conn = 1; + need_reflush = true; } if (drop_conn) @@ -180,6 +177,12 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, break; } + if (need_reflush && !ret && rdma_is_kernel_res(&qp->ibqp.res)) { + qp->flags |= ERDMA_QP_IN_FLUSHING; + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + } + return ret; } @@ -527,6 +530,10 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, } spin_unlock_irqrestore(&qp->lock, flags); + if (unlikely(qp->flags & ERDMA_QP_IN_FLUSHING)) + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + return ret; } @@ -580,5 +587,10 @@ int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, } spin_unlock_irqrestore(&qp->lock, flags); + + if (unlikely(qp->flags & ERDMA_QP_IN_FLUSHING)) + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + return ret; } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 9f341d032069b..e0a993bc032a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -173,6 +173,10 @@ enum erdma_qp_attr_mask { ERDMA_QP_ATTR_MPA = (1 << 7) }; +enum erdma_qp_flags { + ERDMA_QP_IN_FLUSHING = (1 << 0), +}; + struct erdma_qp_attrs { enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ @@ -197,6 +201,7 @@ struct erdma_qp { struct erdma_cep *cep; struct rw_semaphore state_lock; + unsigned long flags; struct delayed_work reflush_dwork; union { -- GitLab From a7008584ab19d2df05caa95634cd72bc41f4cad3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:33 -0800 Subject: [PATCH 0889/1423] crypto: api - optimize algorithm registration when self-tests disabled Currently, registering an algorithm with the crypto API always causes a notification to be posted to the "cryptomgr", which then creates a kthread to self-test the algorithm. However, if self-tests are disabled in the kconfig (as is the default option), then this kthread just notifies waiters that the algorithm has been tested, then exits. This causes a significant amount of overhead, especially in the kthread creation and destruction, which is not necessary at all. For example, in a quick test I found that booting a "minimum" x86_64 kernel with all the crypto options enabled (except for the self-tests) takes about 400ms until PID 1 can start. Of that, a full 13ms is spent just doing this pointless dance, involving a kthread being created, run, and destroyed over 200 times. That's over 3% of the entire kernel start time. Fix this by just skipping the creation of the test larval and the posting of the registration notification entirely, when self-tests are disabled. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algapi.c | 154 +++++++++++++++++++++++++++--------------------- crypto/api.c | 3 - 2 files changed, 86 insertions(+), 71 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 5c69ff8e8fa5c..950195e90bfc9 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -222,12 +222,64 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, } EXPORT_SYMBOL_GPL(crypto_remove_spawns); +static void crypto_alg_finish_registration(struct crypto_alg *alg, + bool fulfill_requests, + struct list_head *algs_to_put) +{ + struct crypto_alg *q; + + list_for_each_entry(q, &crypto_alg_list, cra_list) { + if (q == alg) + continue; + + if (crypto_is_moribund(q)) + continue; + + if (crypto_is_larval(q)) { + struct crypto_larval *larval = (void *)q; + + /* + * Check to see if either our generic name or + * specific name can satisfy the name requested + * by the larval entry q. + */ + if (strcmp(alg->cra_name, q->cra_name) && + strcmp(alg->cra_driver_name, q->cra_name)) + continue; + + if (larval->adult) + continue; + if ((q->cra_flags ^ alg->cra_flags) & larval->mask) + continue; + + if (fulfill_requests && crypto_mod_get(alg)) + larval->adult = alg; + else + larval->adult = ERR_PTR(-EAGAIN); + + continue; + } + + if (strcmp(alg->cra_name, q->cra_name)) + continue; + + if (strcmp(alg->cra_driver_name, q->cra_driver_name) && + q->cra_priority > alg->cra_priority) + continue; + + crypto_remove_spawns(q, algs_to_put, alg); + } + + crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); +} + static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) { struct crypto_larval *larval; - if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER)) - return NULL; + if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); @@ -248,7 +300,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) return larval; } -static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) +static struct crypto_larval * +__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; @@ -259,9 +312,6 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) INIT_LIST_HEAD(&alg->cra_users); - /* No cheating! */ - alg->cra_flags &= ~CRYPTO_ALG_TESTED; - ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { @@ -288,12 +338,17 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&alg->cra_list, &crypto_alg_list); - if (larval) + crypto_stats_init(alg); + + if (larval) { + /* No cheating! */ + alg->cra_flags &= ~CRYPTO_ALG_TESTED; + list_add(&larval->alg.cra_list, &crypto_alg_list); - else + } else { alg->cra_flags |= CRYPTO_ALG_TESTED; - - crypto_stats_init(alg); + crypto_alg_finish_registration(alg, true, algs_to_put); + } out: return larval; @@ -341,7 +396,10 @@ found: alg->cra_flags |= CRYPTO_ALG_TESTED; - /* Only satisfy larval waiters if we are the best. */ + /* + * If a higher-priority implementation of the same algorithm is + * currently being tested, then don't fulfill request larvals. + */ best = true; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) @@ -356,47 +414,7 @@ found: } } - list_for_each_entry(q, &crypto_alg_list, cra_list) { - if (q == alg) - continue; - - if (crypto_is_moribund(q)) - continue; - - if (crypto_is_larval(q)) { - struct crypto_larval *larval = (void *)q; - - /* - * Check to see if either our generic name or - * specific name can satisfy the name requested - * by the larval entry q. - */ - if (strcmp(alg->cra_name, q->cra_name) && - strcmp(alg->cra_driver_name, q->cra_name)) - continue; - - if (larval->adult) - continue; - if ((q->cra_flags ^ alg->cra_flags) & larval->mask) - continue; - - if (best && crypto_mod_get(alg)) - larval->adult = alg; - else - larval->adult = ERR_PTR(-EAGAIN); - - continue; - } - - if (strcmp(alg->cra_name, q->cra_name)) - continue; - - if (strcmp(alg->cra_driver_name, q->cra_driver_name) && - q->cra_priority > alg->cra_priority) - continue; - - crypto_remove_spawns(q, &list, alg); - } + crypto_alg_finish_registration(alg, best, &list); complete: complete_all(&test->completion); @@ -423,7 +441,8 @@ EXPORT_SYMBOL_GPL(crypto_remove_final); int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; - bool test_started; + LIST_HEAD(algs_to_put); + bool test_started = false; int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; @@ -432,17 +451,18 @@ int crypto_register_alg(struct crypto_alg *alg) return err; down_write(&crypto_alg_sem); - larval = __crypto_register_alg(alg); - test_started = static_key_enabled(&crypto_boot_test_finished); - if (!IS_ERR_OR_NULL(larval)) + larval = __crypto_register_alg(alg, &algs_to_put); + if (!IS_ERR_OR_NULL(larval)) { + test_started = static_key_enabled(&crypto_boot_test_finished); larval->test_started = test_started; + } up_write(&crypto_alg_sem); - if (IS_ERR_OR_NULL(larval)) + if (IS_ERR(larval)) return PTR_ERR(larval); - if (test_started) crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); @@ -619,6 +639,7 @@ int crypto_register_instance(struct crypto_template *tmpl, struct crypto_larval *larval; struct crypto_spawn *spawn; u32 fips_internal = 0; + LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); @@ -650,7 +671,7 @@ int crypto_register_instance(struct crypto_template *tmpl, inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL); - larval = __crypto_register_alg(&inst->alg); + larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; else if (larval) @@ -662,15 +683,12 @@ int crypto_register_instance(struct crypto_template *tmpl, unlock: up_write(&crypto_alg_sem); - err = PTR_ERR(larval); - if (IS_ERR_OR_NULL(larval)) - goto err; - - crypto_wait_for_test(larval); - err = 0; - -err: - return err; + if (IS_ERR(larval)) + return PTR_ERR(larval); + if (larval) + crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); + return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); diff --git a/crypto/api.c b/crypto/api.c index 64f2d365a8e94..52ce10a353660 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -172,9 +172,6 @@ void crypto_wait_for_test(struct crypto_larval *larval) err = wait_for_completion_killable(&larval->completion); WARN_ON(err); - if (!err) - crypto_notify(CRYPTO_MSG_ALG_LOADED, larval); - out: crypto_larval_kill(&larval->alg); } -- GitLab From 9cadd73adef1e1d53ea100f28e3e258698b92418 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:34 -0800 Subject: [PATCH 0890/1423] crypto: algboss - optimize registration of internal algorithms Since algboss always skips testing of algorithms with the CRYPTO_ALG_INTERNAL flag, there is no need to go through the dance of creating the test kthread, which creates a lot of overhead. Instead, we can just directly finish the algorithm registration, like is now done when self-tests are disabled entirely. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algapi.c | 3 ++- crypto/algboss.c | 13 +------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 950195e90bfc9..851b247f043d3 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -278,7 +278,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) struct crypto_larval *larval; if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || - IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) || + (alg->cra_flags & CRYPTO_ALG_INTERNAL)) return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, diff --git a/crypto/algboss.c b/crypto/algboss.c index eb5fe84efb83e..13d37320a66eb 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -181,12 +181,8 @@ static int cryptomgr_test(void *data) goto skiptest; #endif - if (type & CRYPTO_ALG_TESTED) - goto skiptest; - err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); -skiptest: crypto_alg_tested(param->driver, err); kfree(param); @@ -197,7 +193,6 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; - u32 type; if (!try_module_get(THIS_MODULE)) goto err; @@ -208,13 +203,7 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); - type = alg->cra_flags; - - /* Do not test internal algorithms. */ - if (type & CRYPTO_ALG_INTERNAL) - type |= CRYPTO_ALG_TESTED; - - param->type = type; + param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) -- GitLab From 06bd9c967eaac5484c31c3dc6dfbef6183819508 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:35 -0800 Subject: [PATCH 0891/1423] crypto: api - compile out crypto_boot_test_finished when tests disabled The crypto_boot_test_finished static key is unnecessary when self-tests are disabled in the kconfig, so optimize it out accordingly, along with the entirety of crypto_start_tests(). This mainly avoids the overhead of an unnecessary static_branch_enable() on every boot. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algapi.c | 7 +++++-- crypto/api.c | 8 +++++--- crypto/internal.h | 20 +++++++++++++++++++- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 851b247f043d3..d08f864f08bee 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -454,7 +454,7 @@ int crypto_register_alg(struct crypto_alg *alg) down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg, &algs_to_put); if (!IS_ERR_OR_NULL(larval)) { - test_started = static_key_enabled(&crypto_boot_test_finished); + test_started = crypto_boot_test_finished(); larval->test_started = test_started; } up_write(&crypto_alg_sem); @@ -1253,6 +1253,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt); static void __init crypto_start_tests(void) { + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return; + for (;;) { struct crypto_larval *larval = NULL; struct crypto_alg *q; @@ -1286,7 +1289,7 @@ static void __init crypto_start_tests(void) crypto_wait_for_test(larval); } - static_branch_enable(&crypto_boot_test_finished); + set_crypto_boot_test_finished(); } static int __init crypto_algapi_init(void) diff --git a/crypto/api.c b/crypto/api.c index 52ce10a353660..b022702f64367 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,8 +31,10 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); -DEFINE_STATIC_KEY_FALSE(crypto_boot_test_finished); -EXPORT_SYMBOL_GPL(crypto_boot_test_finished); +#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); +#endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); @@ -202,7 +204,7 @@ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) struct crypto_larval *larval = (void *)alg; long timeout; - if (!static_branch_likely(&crypto_boot_test_finished)) + if (!crypto_boot_test_finished()) crypto_start_test(larval); timeout = wait_for_completion_killable_timeout( diff --git a/crypto/internal.h b/crypto/internal.h index c08385571853e..932f0aafddc32 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -47,7 +47,25 @@ extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; -DECLARE_STATIC_KEY_FALSE(crypto_boot_test_finished); +#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +static inline bool crypto_boot_test_finished(void) +{ + return true; +} +static inline void set_crypto_boot_test_finished(void) +{ +} +#else +DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +static inline bool crypto_boot_test_finished(void) +{ + return static_branch_likely(&__crypto_boot_test_finished); +} +static inline void set_crypto_boot_test_finished(void) +{ + static_branch_enable(&__crypto_boot_test_finished); +} +#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); -- GitLab From 0bf365c0efdd8fc03cb82e381ea4d76196c66bc2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:36 -0800 Subject: [PATCH 0892/1423] crypto: kdf - skip self-test when tests disabled Make kdf_sp800108 honor the CONFIG_CRYPTO_MANAGER_DISABLE_TESTS kconfig option, so that it doesn't always waste time running its self-test. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/kdf_sp800108.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crypto/kdf_sp800108.c b/crypto/kdf_sp800108.c index 58edf7797abfb..c6e3ad82d5f7a 100644 --- a/crypto/kdf_sp800108.c +++ b/crypto/kdf_sp800108.c @@ -125,9 +125,13 @@ static const struct kdf_testvec kdf_ctr_hmac_sha256_tv_template[] = { static int __init crypto_kdf108_init(void) { - int ret = kdf_test(&kdf_ctr_hmac_sha256_tv_template[0], "hmac(sha256)", - crypto_kdf108_setkey, crypto_kdf108_ctr_generate); + int ret; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return 0; + + ret = kdf_test(&kdf_ctr_hmac_sha256_tv_template[0], "hmac(sha256)", + crypto_kdf108_setkey, crypto_kdf108_ctr_generate); if (ret) { if (fips_enabled) panic("alg: self-tests for CTR-KDF (hmac(sha256)) failed (rc=%d)\n", -- GitLab From 790c4c9f532318e3fe8c6f0b498072abc80e1195 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:37 -0800 Subject: [PATCH 0893/1423] crypto: kdf - silence noisy self-test Make the kdf_sp800108 self-test only print a message on success when fips_enabled, so that it's consistent with testmgr.c and doesn't spam the kernel log with a message that isn't really important. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/kdf_sp800108.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/kdf_sp800108.c b/crypto/kdf_sp800108.c index c6e3ad82d5f7a..c3f9938e1ad27 100644 --- a/crypto/kdf_sp800108.c +++ b/crypto/kdf_sp800108.c @@ -140,7 +140,7 @@ static int __init crypto_kdf108_init(void) WARN(1, "alg: self-tests for CTR-KDF (hmac(sha256)) failed (rc=%d)\n", ret); - } else { + } else if (fips_enabled) { pr_info("alg: self-tests for CTR-KDF (hmac(sha256)) passed\n"); } -- GitLab From 441cb1b730006bd2d636f72dc7f6e11a8a0ecce5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:38 -0800 Subject: [PATCH 0894/1423] crypto: algboss - compile out test-related code when tests disabled When CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is set, the code in algboss.c that handles CRYPTO_MSG_ALG_REGISTER is unnecessary, so make it be compiled out. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algboss.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crypto/algboss.c b/crypto/algboss.c index 13d37320a66eb..0de1e66979498 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -175,11 +175,7 @@ static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; - int err = 0; - -#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS - goto skiptest; -#endif + int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); @@ -194,6 +190,9 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) struct task_struct *thread; struct crypto_test_param *param; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return NOTIFY_DONE; + if (!try_module_get(THIS_MODULE)) goto err; -- GitLab From 1aa33fc8d4032227253ceb736f47c52b859d9683 Mon Sep 17 00:00:00 2001 From: Zhang Yiqun Date: Wed, 16 Nov 2022 17:24:11 +0800 Subject: [PATCH 0895/1423] crypto: tcrypt - Fix multibuffer skcipher speed test mem leak In the past, the data for mb-skcipher test has been allocated twice, that means the first allcated memory area is without free, which may cause a potential memory leakage. So this patch is to remove one allocation to fix this error. Fixes: e161c5930c15 ("crypto: tcrypt - add multibuf skcipher...") Signed-off-by: Zhang Yiqun Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 0f101897e90fb..a0833654ce946 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1090,15 +1090,6 @@ static void test_mb_skcipher_speed(const char *algo, int enc, int secs, goto out_free_tfm; } - - for (i = 0; i < num_mb; ++i) - if (testmgr_alloc_buf(data[i].xbuf)) { - while (i--) - testmgr_free_buf(data[i].xbuf); - goto out_free_tfm; - } - - for (i = 0; i < num_mb; ++i) { data[i].req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!data[i].req) { -- GitLab From 34c3a47d20ae55b3600fed733bf96eafe9c500d5 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 16 Nov 2022 20:28:02 -0500 Subject: [PATCH 0896/1423] padata: Always leave BHs disabled when running ->parallel() A deadlock can happen when an overloaded system runs ->parallel() in the context of the current task: padata_do_parallel ->parallel() pcrypt_aead_enc/dec padata_do_serial spin_lock(&reorder->lock) // BHs still enabled ... __do_softirq ... padata_do_serial spin_lock(&reorder->lock) It's a bug for BHs to be on in _do_serial as Steffen points out, so ensure they're off in the "current task" case like they are in padata_parallel_worker to avoid this situation. Reported-by: syzbot+bc05445bc14148d51915@syzkaller.appspotmail.com Fixes: 4611ce224688 ("padata: allocate work structures for parallel jobs from a pool") Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/padata.c b/kernel/padata.c index e5819bb8bd1dc..97f51e0c17766 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -207,14 +207,16 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + if (!pw) { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + rcu_read_unlock_bh(); if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); - } else { - /* Maximum works limit exceeded, run in the current task. */ - padata->parallel(padata); } return 0; -- GitLab From 57ddfecc72a6c9941d159543e1c0c0a74fe9afdd Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 16 Nov 2022 20:28:04 -0500 Subject: [PATCH 0897/1423] padata: Fix list iterator in padata_do_serial() list_for_each_entry_reverse() assumes that the iterated list is nonempty and that every list_head is embedded in the same type, but its use in padata_do_serial() breaks both rules. This doesn't cause any issues now because padata_priv and padata_list happen to have their list fields at the same offset, but we really shouldn't be relying on that. Fixes: bfde23ce200e ("padata: unbind parallel jobs from specific CPUs") Signed-off-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/padata.c b/kernel/padata.c index 97f51e0c17766..de90af5fcbe6b 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -390,13 +390,16 @@ void padata_do_serial(struct padata_priv *padata) int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; + struct list_head *pos; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &reorder->list, list) + list_for_each_prev(pos, &reorder->list) { + cur = list_entry(pos, struct padata_priv, list); if (cur->seq_nr < padata->seq_nr) break; - list_add(&padata->list, &cur->list); + } + list_add(&padata->list, pos); spin_unlock(&reorder->lock); /* -- GitLab From 8bd9974b6bfcd1e14a001deeca051aed7295559a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:10 -0800 Subject: [PATCH 0898/1423] crypto: x86/aegis128 - fix possible crash with CFI enabled crypto_aegis128_aesni_enc(), crypto_aegis128_aesni_enc_tail(), crypto_aegis128_aesni_dec(), and crypto_aegis128_aesni_dec_tail() are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect calls). Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-asm.S | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index b48ddebb47489..cdf3215ec272c 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -7,6 +7,7 @@ */ #include +#include #include #define STATE0 %xmm0 @@ -402,7 +403,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad) * void crypto_aegis128_aesni_enc(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) FRAME_BEGIN cmp $0x10, LEN @@ -499,7 +500,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc) * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_BEGIN /* load the state: */ @@ -556,7 +557,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) * void crypto_aegis128_aesni_dec(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) FRAME_BEGIN cmp $0x10, LEN @@ -653,7 +654,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec) * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, * const void *src, void *dst); */ -SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) +SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_BEGIN /* load the state: */ -- GitLab From c67b553a4f4a8bd921e4c9ceae00e111be09c488 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:11 -0800 Subject: [PATCH 0899/1423] crypto: x86/aria - fix crash with CFI enabled aria_aesni_avx_encrypt_16way(), aria_aesni_avx_decrypt_16way(), aria_aesni_avx_ctr_crypt_16way(), aria_aesni_avx_gfni_encrypt_16way(), aria_aesni_avx_gfni_decrypt_16way(), and aria_aesni_avx_gfni_ctr_crypt_16way() are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure. Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Cc: Taehee Yoo Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aria-aesni-avx-asm_64.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index c75fd7d015ed8..03ae4cd1d976a 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -7,6 +7,7 @@ */ #include +#include #include /* struct aria_ctx: */ @@ -913,7 +914,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) RET; SYM_FUNC_END(__aria_aesni_avx_crypt_16way) -SYM_FUNC_START(aria_aesni_avx_encrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -938,7 +939,7 @@ SYM_FUNC_START(aria_aesni_avx_encrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_encrypt_16way) -SYM_FUNC_START(aria_aesni_avx_decrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1039,7 +1040,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way) RET; SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way) -SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) /* input: * %rdi: ctx * %rsi: dst @@ -1208,7 +1209,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) RET; SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1233,7 +1234,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -1258,7 +1259,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) RET; SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way) -SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) +SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) /* input: * %rdi: ctx * %rsi: dst -- GitLab From 0f8bc4bd48dd148046c19c38568cd9449c79b45f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:12 -0800 Subject: [PATCH 0900/1423] crypto: x86/nhpoly1305 - eliminate unnecessary CFI wrappers Since the CFI implementation now supports indirect calls to assembly functions, take advantage of that rather than use wrapper functions. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/nh-avx2-x86_64.S | 5 +++-- arch/x86/crypto/nh-sse2-x86_64.S | 5 +++-- arch/x86/crypto/nhpoly1305-avx2-glue.c | 11 ++--------- arch/x86/crypto/nhpoly1305-sse2-glue.c | 11 ++--------- 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index 6a0b15e7196a8..ef73a3ab87263 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -8,6 +8,7 @@ */ #include +#include #define PASS0_SUMS %ymm0 #define PASS1_SUMS %ymm1 @@ -65,11 +66,11 @@ /* * void nh_avx2(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ -SYM_FUNC_START(nh_avx2) +SYM_TYPED_FUNC_START(nh_avx2) vmovdqu 0x00(KEY), K0 vmovdqu 0x10(KEY), K1 diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index 34c567bbcb4fa..75fb994b6d177 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -8,6 +8,7 @@ */ #include +#include #define PASS0_SUMS %xmm0 #define PASS1_SUMS %xmm1 @@ -67,11 +68,11 @@ /* * void nh_sse2(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ -SYM_FUNC_START(nh_sse2) +SYM_TYPED_FUNC_START(nh_sse2) movdqu 0x00(KEY), K0 movdqu 0x10(KEY), K1 diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 8ea5ab0f1ca74..46b036204ed91 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -14,14 +14,7 @@ #include asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_avx2(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); + crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2); kernel_fpu_end(); src += n; srclen -= n; diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index 2b353d42ed13f..4a4970d751076 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -14,14 +14,7 @@ #include asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_sse2(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); + crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2); kernel_fpu_end(); src += n; srclen -= n; -- GitLab From 32f34bf7e44eeaa241fb845d6f52af5104bc30fd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:13 -0800 Subject: [PATCH 0901/1423] crypto: x86/sha1 - fix possible crash with CFI enabled sha1_transform_ssse3(), sha1_transform_avx(), and sha1_ni_transform() (but not sha1_transform_avx2()) are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect calls). Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ni_asm.S | 3 ++- arch/x86/crypto/sha1_ssse3_asm.S | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 2f94ec0e763bf..3cae5a1bb3d6e 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -54,6 +54,7 @@ */ #include +#include #define DIGEST_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ @@ -93,7 +94,7 @@ */ .text .align 32 -SYM_FUNC_START(sha1_ni_transform) +SYM_TYPED_FUNC_START(sha1_ni_transform) push %rbp mov %rsp, %rbp sub $FRAME_SIZE, %rsp diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 263f916362e02..f54988c80eb40 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -25,6 +25,7 @@ */ #include +#include #define CTX %rdi // arg1 #define BUF %rsi // arg2 @@ -67,7 +68,7 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) + SYM_TYPED_FUNC_START(\name) push %rbx push %r12 -- GitLab From 19940ebbb59c12146d05c5f8acd873197b290648 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:14 -0800 Subject: [PATCH 0902/1423] crypto: x86/sha256 - fix possible crash with CFI enabled sha256_transform_ssse3(), sha256_transform_avx(), sha256_transform_rorx(), and sha256_ni_transform() are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect calls). Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx-asm.S | 3 ++- arch/x86/crypto/sha256-avx2-asm.S | 3 ++- arch/x86/crypto/sha256-ssse3-asm.S | 3 ++- arch/x86/crypto/sha256_ni_asm.S | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 3baa1ec390974..06ea30c20828d 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include +#include ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -346,7 +347,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_avx) +SYM_TYPED_FUNC_START(sha256_transform_avx) .align 32 pushq %rbx pushq %r12 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9bcdbc47b8b4b..2d2be531a11ed 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -49,6 +49,7 @@ ######################################################################## #include +#include ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -523,7 +524,7 @@ STACK_SIZE = _CTX + _CTX_SIZE ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_rorx) +SYM_TYPED_FUNC_START(sha256_transform_rorx) .align 32 pushq %rbx pushq %r12 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c4a5db612c327..7db28839108dd 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -47,6 +47,7 @@ ######################################################################## #include +#include ## assume buffers not aligned #define MOVDQ movdqu @@ -355,7 +356,7 @@ a = TMP_ ## arg 3 : Num blocks ######################################################################## .text -SYM_FUNC_START(sha256_transform_ssse3) +SYM_TYPED_FUNC_START(sha256_transform_ssse3) .align 32 pushq %rbx pushq %r12 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 94d50dd27cb53..47f93937f798a 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -54,6 +54,7 @@ */ #include +#include #define DIGEST_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ @@ -97,7 +98,7 @@ .text .align 32 -SYM_FUNC_START(sha256_ni_transform) +SYM_TYPED_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash -- GitLab From a1d72fa33186ac69c7d8120c71f41ea4fc23dcc9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:15 -0800 Subject: [PATCH 0903/1423] crypto: x86/sha512 - fix possible crash with CFI enabled sha512_transform_ssse3(), sha512_transform_avx(), and sha512_transform_rorx() are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect calls). Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-avx-asm.S | 3 ++- arch/x86/crypto/sha512-avx2-asm.S | 3 ++- arch/x86/crypto/sha512-ssse3-asm.S | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 1fefe6dd3a9e2..b0984f19fdb40 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include +#include .text @@ -273,7 +274,7 @@ frame_size = frame_WK + WK_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks ######################################################################## -SYM_FUNC_START(sha512_transform_avx) +SYM_TYPED_FUNC_START(sha512_transform_avx) test msglen, msglen je nowork diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 5cdaab7d69015..b1ca99055ef99 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -50,6 +50,7 @@ ######################################################################## #include +#include .text @@ -565,7 +566,7 @@ frame_size = frame_CTX + CTX_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks ######################################################################## -SYM_FUNC_START(sha512_transform_rorx) +SYM_TYPED_FUNC_START(sha512_transform_rorx) # Save GPRs push %rbx push %r12 diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index b84c22e06c5f7..c06afb5270e5f 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -48,6 +48,7 @@ ######################################################################## #include +#include .text @@ -274,7 +275,7 @@ frame_size = frame_WK + WK_SIZE # of SHA512 message blocks. # "blocks" is the message length in SHA512 blocks. ######################################################################## -SYM_FUNC_START(sha512_transform_ssse3) +SYM_TYPED_FUNC_START(sha512_transform_ssse3) test msglen, msglen je nowork -- GitLab From 8ba490d9f5a56f52091644325a32d3f71a982776 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:16 -0800 Subject: [PATCH 0904/1423] crypto: x86/sm3 - fix possible crash with CFI enabled sm3_transform_avx() is called via indirect function calls. Therefore it needs to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause its type hash to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect call). Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/sm3-avx-asm_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S index b12b9efb5ec51..8fc5ac681fd63 100644 --- a/arch/x86/crypto/sm3-avx-asm_64.S +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -12,6 +12,7 @@ */ #include +#include #include /* Context structure */ @@ -328,7 +329,7 @@ * const u8 *data, int nblocks); */ .align 16 -SYM_FUNC_START(sm3_transform_avx) +SYM_TYPED_FUNC_START(sm3_transform_avx) /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) -- GitLab From 2d203c46a0fa5df0785383b13b722483e1fd27a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:17 -0800 Subject: [PATCH 0905/1423] crypto: x86/sm4 - fix crash with CFI enabled sm4_aesni_avx_ctr_enc_blk8(), sm4_aesni_avx_cbc_dec_blk8(), sm4_aesni_avx_cfb_dec_blk8(), sm4_aesni_avx2_ctr_enc_blk16(), sm4_aesni_avx2_cbc_dec_blk16(), and sm4_aesni_avx2_cfb_dec_blk16() are called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause their type hashes to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure. (Or at least that should be the case. For some reason the CFI checks in sm4_avx_cbc_decrypt(), sm4_avx_cfb_decrypt(), and sm4_avx_ctr_crypt() are not always being generated, using current tip-of-tree clang. Anyway, this patch is a good idea anyway.) Fixes: ccace936eec7 ("x86: Add types to indirectly called assembly functions") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/sm4-aesni-avx-asm_64.S | 7 ++++--- arch/x86/crypto/sm4-aesni-avx2-asm_64.S | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 4767ab61ff489..22b6560eb9e1e 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -14,6 +14,7 @@ */ #include +#include #include #define rRIP (%rip) @@ -420,7 +421,7 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) @@ -495,7 +496,7 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) @@ -545,7 +546,7 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) +SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index 4732fe8bb65b6..23ee39a8ada8c 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -14,6 +14,7 @@ */ #include +#include #include #define rRIP (%rip) @@ -282,7 +283,7 @@ SYM_FUNC_END(__sm4_crypt_blk16) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) @@ -395,7 +396,7 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) @@ -449,7 +450,7 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) * const u8 *src, u8 *iv) */ .align 8 -SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) +SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) /* input: * %rdi: round key array, CTX * %rsi: dst (16 blocks) -- GitLab From e5e1c67e2f01d924e9583b67a907934948d852aa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:18 -0800 Subject: [PATCH 0906/1423] crypto: arm64/nhpoly1305 - eliminate unnecessary CFI wrapper Since the CFI implementation now supports indirect calls to assembly functions, take advantage of that rather than use a wrapper function. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/nh-neon-core.S | 5 +++-- arch/arm64/crypto/nhpoly1305-neon-glue.c | 11 ++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S index 51c0a534ef87c..13eda08fda1e5 100644 --- a/arch/arm64/crypto/nh-neon-core.S +++ b/arch/arm64/crypto/nh-neon-core.S @@ -8,6 +8,7 @@ */ #include +#include KEY .req x0 MESSAGE .req x1 @@ -58,11 +59,11 @@ /* * void nh_neon(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ -SYM_FUNC_START(nh_neon) +SYM_TYPED_FUNC_START(nh_neon) ld1 {K0.4s,K1.4s}, [KEY], #32 movi PASS0_SUMS.2d, #0 diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index c5405e6a6db76..cd882c35d9252 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -14,14 +14,7 @@ #include asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_neon(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_neon_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + crypto_nhpoly1305_update_helper(desc, src, n, nh_neon); kernel_neon_end(); src += n; srclen -= n; -- GitLab From be8f6b6496076588fd49cbe5bfaaf3ab883eb779 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:19 -0800 Subject: [PATCH 0907/1423] crypto: arm64/sm3 - fix possible crash with CFI enabled sm3_neon_transform() is called via indirect function calls. Therefore it needs to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause its type hash to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect call). Fixes: c50d32859e70 ("arm64: Add types to indirect called assembly functions") Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm3-neon-core.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/crypto/sm3-neon-core.S b/arch/arm64/crypto/sm3-neon-core.S index 3e3b4e5c736fc..4357e0e51be38 100644 --- a/arch/arm64/crypto/sm3-neon-core.S +++ b/arch/arm64/crypto/sm3-neon-core.S @@ -9,6 +9,7 @@ */ #include +#include #include /* Context structure */ @@ -351,7 +352,7 @@ */ .text .align 3 -SYM_FUNC_START(sm3_neon_transform) +SYM_TYPED_FUNC_START(sm3_neon_transform) ldp ra, rb, [RSTATE, #0] ldp rc, rd, [RSTATE, #8] ldp re, rf, [RSTATE, #16] -- GitLab From cc7acaadf6ab5d44f43170eb568e1cc9739c3df4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:20 -0800 Subject: [PATCH 0908/1423] crypto: arm/nhpoly1305 - eliminate unnecessary CFI wrapper The arm architecture doesn't support CFI yet, and even if it did, the new CFI implementation supports indirect calls to assembly functions. Therefore, there's no need to use a wrapper function for nh_neon(). Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/nh-neon-core.S | 2 +- arch/arm/crypto/nhpoly1305-neon-glue.c | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S index 434d80ab531c2..01620a0782ca9 100644 --- a/arch/arm/crypto/nh-neon-core.S +++ b/arch/arm/crypto/nh-neon-core.S @@ -69,7 +69,7 @@ /* * void nh_neon(const u32 *key, const u8 *message, size_t message_len, - * u8 hash[NH_HASH_BYTES]) + * __le64 hash[NH_NUM_PASSES]) * * It's guaranteed that message_len % 16 == 0. */ diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c index ffa8d73fe722c..e93e41ff26566 100644 --- a/arch/arm/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -14,14 +14,7 @@ #include asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, - u8 hash[NH_HASH_BYTES]); - -/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ -static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]) -{ - nh_neon(key, message, message_len, (u8 *)hash); -} + __le64 hash[NH_NUM_PASSES]); static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) @@ -33,7 +26,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_neon_begin(); - crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + crypto_nhpoly1305_update_helper(desc, src, n, nh_neon); kernel_neon_end(); src += n; srclen -= n; -- GitLab From c060e16ddb51a92b1f7fa84c628d287ea5799864 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 Nov 2022 11:44:21 -0800 Subject: [PATCH 0909/1423] Revert "crypto: shash - avoid comparing pointers to exported functions under CFI" This reverts commit 22ca9f4aaf431a9413dcc115dd590123307f274f because CFI no longer breaks cross-module function address equality, so crypto_shash_alg_has_setkey() can now be an inline function like before. This commit should not be backported to kernels that don't have the new CFI implementation. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/shash.c | 18 +++--------------- include/crypto/internal/hash.h | 8 +++++++- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/crypto/shash.c b/crypto/shash.c index 4c88e63b3350f..0f85431588267 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -20,24 +20,12 @@ static const struct crypto_type crypto_shash_type; -static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) +int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) { return -ENOSYS; } - -/* - * Check whether an shash algorithm has a setkey function. - * - * For CFI compatibility, this must not be an inline function. This is because - * when CFI is enabled, modules won't get the same address for shash_no_setkey - * (if it were exported, which inlining would require) as the core kernel will. - */ -bool crypto_shash_alg_has_setkey(struct shash_alg *alg) -{ - return alg->setkey != shash_no_setkey; -} -EXPORT_SYMBOL_GPL(crypto_shash_alg_has_setkey); +EXPORT_SYMBOL_GPL(shash_no_setkey); static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 25806141db591..0a288dddcf5be 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -75,7 +75,13 @@ void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); -bool crypto_shash_alg_has_setkey(struct shash_alg *alg); +int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen); + +static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg) +{ + return alg->setkey != shash_no_setkey; +} static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg) { -- GitLab From b8ed0bff96393cbeef66f00f34fda2a4960b75e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:35:40 +0100 Subject: [PATCH 0910/1423] crypto: atmel-ecc - Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Herbert Xu --- drivers/crypto/atmel-ecc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/atmel-ecc.c b/drivers/crypto/atmel-ecc.c index 82bf15d495614..53100fb9b07bd 100644 --- a/drivers/crypto/atmel-ecc.c +++ b/drivers/crypto/atmel-ecc.c @@ -311,9 +311,9 @@ static struct kpp_alg atmel_ecdh_nist_p256 = { }, }; -static int atmel_ecc_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int atmel_ecc_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct atmel_i2c_client_priv *i2c_priv; int ret; @@ -390,7 +390,7 @@ static struct i2c_driver atmel_ecc_driver = { .name = "atmel-ecc", .of_match_table = of_match_ptr(atmel_ecc_dt_ids), }, - .probe = atmel_ecc_probe, + .probe_new = atmel_ecc_probe, .remove = atmel_ecc_remove, .id_table = atmel_ecc_id, }; -- GitLab From fa2ca3b275874d61f42e68f5eb13645bbeb5d72b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:35:41 +0100 Subject: [PATCH 0911/1423] crypto: atmel-sha204a - Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Herbert Xu --- drivers/crypto/atmel-sha204a.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index c0103e7fc2e75..272a06f0b5886 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -91,9 +91,9 @@ static int atmel_sha204a_rng_read(struct hwrng *rng, void *data, size_t max, return max; } -static int atmel_sha204a_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int atmel_sha204a_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct atmel_i2c_client_priv *i2c_priv; int ret; @@ -142,7 +142,7 @@ static const struct i2c_device_id atmel_sha204a_id[] = { MODULE_DEVICE_TABLE(i2c, atmel_sha204a_id); static struct i2c_driver atmel_sha204a_driver = { - .probe = atmel_sha204a_probe, + .probe_new = atmel_sha204a_probe, .remove = atmel_sha204a_remove, .id_table = atmel_sha204a_id, -- GitLab From 3901355624d14afe3230252cb36bc3da8ff6890e Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 19 Nov 2022 17:48:43 +0800 Subject: [PATCH 0912/1423] crypto: hisilicon/qm - fix 'QM_XEQ_DEPTH_CAP' mask value 'QM_XEQ_DEPTH_CAP' mask value is GENMASK(31, 0) instead of GENMASK(15, 0). If the mask value is incorrect, will cause abnormal events cannot be handled. So fix it. Fixes: 129a9f340172 ("crypto: hisilicon/qm - get qp num and depth from hardware registers") Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 36d70b9f6117c..9072bee7336fe 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -336,7 +336,7 @@ static const struct hisi_qm_cap_info qm_cap_info_vf[] = { static const struct hisi_qm_cap_info qm_basic_info[] = { {QM_TOTAL_QP_NUM_CAP, 0x100158, 0, GENMASK(10, 0), 0x1000, 0x400, 0x400}, {QM_FUNC_MAX_QP_CAP, 0x100158, 11, GENMASK(10, 0), 0x1000, 0x400, 0x400}, - {QM_XEQ_DEPTH_CAP, 0x3104, 0, GENMASK(15, 0), 0x800, 0x4000800, 0x4000800}, + {QM_XEQ_DEPTH_CAP, 0x3104, 0, GENMASK(31, 0), 0x800, 0x4000800, 0x4000800}, {QM_QP_DEPTH_CAP, 0x3108, 0, GENMASK(31, 0), 0x4000400, 0x4000400, 0x4000400}, {QM_EQ_IRQ_TYPE_CAP, 0x310c, 0, GENMASK(31, 0), 0x10000, 0x10000, 0x10000}, {QM_AEQ_IRQ_TYPE_CAP, 0x3110, 0, GENMASK(31, 0), 0x0, 0x10001, 0x10001}, -- GitLab From 5f9c97a0e6dc873f662528ae591f2bd500eb5940 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 19 Nov 2022 17:50:03 +0800 Subject: [PATCH 0913/1423] crypto: hisilicon/qm - add device status check when start fails In function 'hisi_qm_resume', if the device fails to be started, directly returning error code will cause the device to be unavailable. However, the failure may be caused by device error, which will be reported to the driver, and driver can reset and restart device. Therefore, check device status instead of returning error code directly. Returns 0 if device error has occurred, otherwise returns error code. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 9072bee7336fe..007ac7a69ce74 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -5468,8 +5468,14 @@ int hisi_qm_resume(struct device *dev) } ret = hisi_qm_start(qm); - if (ret) - pci_err(pdev, "failed to start qm(%d)\n", ret); + if (ret) { + if (qm_check_dev_error(qm)) { + pci_info(pdev, "failed to start qm due to device error, device will be reset!\n"); + return 0; + } + + pci_err(pdev, "failed to start qm(%d)!\n", ret); + } return ret; } -- GitLab From 83478938f78fd640c72af83d739a90c840e1b876 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 19 Nov 2022 14:42:59 +0100 Subject: [PATCH 0914/1423] hwrng: u2fzero - account for high quality RNG The U2F zero apparently has a real TRNG in it with maximum quality, not one with quality of "1", which was likely a misinterpretation of the field as a boolean. So remove the assignment entirely, so that we get the default quality setting. In the u2f-zero firmware, the 0x21 RNG command used by this driver is handled as such [1]: case U2F_CUSTOM_GET_RNG: if (atecc_send_recv(ATECC_CMD_RNG,ATECC_RNG_P1,ATECC_RNG_P2, NULL, 0, appdata.tmp, sizeof(appdata.tmp), &res) == 0 ) { memmove(msg->pkt.init.payload, res.buf, 32); U2FHID_SET_LEN(msg, 32); usb_write((uint8_t*)msg, 64); } else { U2FHID_SET_LEN(msg, 0); usb_write((uint8_t*)msg, 64); } This same call to `atecc_send_recv(ATECC_CMD_RNG,ATECC_RNG_P1, ATECC_RNG_P2,...)` is then also used in the token's cryptographically critical "u2f_new_keypair" function, as its rather straightforward source of random bytes [2]: int8_t u2f_new_keypair(uint8_t * handle, uint8_t * appid, uint8_t * pubkey) { struct atecc_response res; uint8_t private_key[36]; int i; watchdog(); if (atecc_send_recv(ATECC_CMD_RNG,ATECC_RNG_P1,ATECC_RNG_P2, NULL, 0, appdata.tmp, sizeof(appdata.tmp), &res) != 0 ) { return -1; } So it seems rather plain that the ATECC RNG is considered to provide good random numbers. [1] https://github.com/conorpp/u2f-zero/blob/master/firmware/src/custom.c [2] https://github.com/conorpp/u2f-zero/blob/master/firmware/src/u2f_atecc.c Cc: Andrej Shadura Cc: Jiri Kosina Cc: Herbert Xu Signed-off-by: Jason A. Donenfeld Acked-by: Andrej Shadura Signed-off-by: Herbert Xu --- drivers/hid/hid-u2fzero.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index ad489caf53ad8..744a91e6e78c5 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -261,7 +261,6 @@ static int u2fzero_init_hwrng(struct u2fzero_device *dev, dev->hwrng.name = dev->rng_name; dev->hwrng.read = u2fzero_rng_read; - dev->hwrng.quality = 1; return devm_hwrng_register(&dev->hdev->dev, &dev->hwrng); } -- GitLab From 09f530f0c6d6689eee5e690c6d98f495fcc3a0f9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 23 Nov 2022 20:27:14 -0400 Subject: [PATCH 0915/1423] RDMA: Add netdevice_tracker to ib_device_set_netdev() This will cause an informative backtrace to print if the user of ib_device_set_netdev() isn't careful about tearing down the ibdevice before its the netdevice parent is destroyed. Such as like this: unregister_netdevice: waiting for vlan0 to become free. Usage count = 2 leaked reference. ib_device_set_netdev+0x266/0x730 siw_newlink+0x4e0/0xfd0 nldev_newlink+0x35c/0x5c0 rdma_nl_rcv_msg+0x36d/0x690 rdma_nl_rcv+0x2ee/0x430 netlink_unicast+0x543/0x7f0 netlink_sendmsg+0x918/0xe20 sock_sendmsg+0xcf/0x120 ____sys_sendmsg+0x70d/0x8b0 ___sys_sendmsg+0x11d/0x1b0 __sys_sendmsg+0xfa/0x1d0 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd This will help debug the issues syzkaller is seeing. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-a7c81b3842ce+e5-netdev_tracker_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 6 ++++-- include/rdma/ib_verbs.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3409c55ea88bf..ff35cebb25e26 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2159,14 +2159,16 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, return 0; } + if (old_ndev) + netdev_tracker_free(ndev, &pdata->netdev_tracker); if (ndev) - dev_hold(ndev); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); if (old_ndev) - dev_put(old_ndev); + __dev_put(old_ndev); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1f4d53a4bb63..77dd9148815b9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2203,6 +2203,7 @@ struct ib_port_data { struct ib_port_cache cache; struct net_device __rcu *netdev; + netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; -- GitLab From ea5ef136e215fdef35f14010bc51fcd6686e6922 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Sat, 26 Nov 2022 04:34:10 +0000 Subject: [PATCH 0916/1423] RDMA/nldev: Add checks for nla_nest_start() in fill_stat_counter_qps() As the nla_nest_start() may fail with NULL returned, the return value needs to be checked. Fixes: c4ffee7c9bdb ("RDMA/netlink: Implement counter dumpit calback") Signed-off-by: Yuan Can Link: https://lore.kernel.org/r/20221126043410.85632-1-yuancan@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 2be76a3fdd87a..ca0ed7d143261 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -894,6 +894,8 @@ static int fill_stat_counter_qps(struct sk_buff *msg, int ret = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; rt = &counter->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); -- GitLab From 86815735aa571d493cf5768cad5fa8e6fd9c7ba8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 28 Nov 2022 19:26:29 +0530 Subject: [PATCH 0917/1423] KVM: arm64: PMU: Replace version number '0' with ID_AA64DFR0_EL1_PMUVer_NI kvm_host_pmu_init() returns when detected PMU is either not implemented, or implementation defined. kvm_pmu_probe_armpmu() also has a similar situation. Extracted ID_AA64DFR0_EL1_PMUVer value, when PMU is not implemented is '0', which can be replaced with ID_AA64DFR0_EL1_PMUVer_NI defined as '0b0000'. Cc: Arnaldo Carvalho de Melo Cc: Marc Zyngier Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: linux-perf-users@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Anshuman Khandual Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221128135629.118346-1-anshuman.khandual@arm.com --- arch/arm64/kvm/pmu-emul.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 3295dea34f4c9..bb7251e670a96 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -668,7 +668,8 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; - if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || + pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) return; mutex_lock(&arm_pmus_lock); @@ -721,7 +722,7 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) if (event->pmu) { pmu = to_arm_pmu(event->pmu); - if (pmu->pmuver == 0 || + if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI || pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) pmu = NULL; } -- GitLab From 292e8f1494764ac46dd1b7dd46fa317db691436c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 24 Nov 2022 10:40:02 +0000 Subject: [PATCH 0918/1423] KVM: arm64: PMU: Simplify PMCR_EL0 reset handling Resetting PMCR_EL0 is a pretty involved process that includes poisoning some of the writable bits, just because we can. It makes it hard to reason about about what gets configured, and just resetting things to 0 seems like a much saner option. Reduce reset_pmcr() to just preserving PMCR_EL0.N from the host, and setting PMCR_EL0.LC if we don't support AArch32. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 67eac0f747be8..eb56ad031116b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -639,24 +639,18 @@ static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 pmcr, val; + u64 pmcr; /* No PMU available, PMCR_EL0 may UNDEF... */ if (!kvm_arm_support_pmu_v3()) return; - pmcr = read_sysreg(pmcr_el0); - /* - * Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN - * except PMCR.E resetting to zero. - */ - val = ((pmcr & ~ARMV8_PMU_PMCR_MASK) - | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); + /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ + pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK; if (!kvm_supports_32bit_el0()) - val |= ARMV8_PMU_PMCR_LC; - if (!kvm_pmu_is_3p5(vcpu)) - val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, r->reg) = val; + pmcr |= ARMV8_PMU_PMCR_LC; + + __vcpu_sys_reg(vcpu, r->reg) = pmcr; } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) -- GitLab From 64d6820d64c0a206e744bd8945374d563a76c16c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 24 Nov 2022 10:44:59 +0000 Subject: [PATCH 0919/1423] KVM: arm64: PMU: Sanitise PMCR_EL0.LP on first vcpu run Userspace can play some dirty tricks on us by selecting a given PMU version (such as PMUv3p5), restore a PMCR_EL0 value that has PMCR_EL0.LP set, and then switch the PMU version to PMUv3p1, for example. In this situation, we end-up with PMCR_EL0.LP being set and spreading havoc in the PMU emulation. This is specially hard as the first two step can be done on one vcpu and the third step on another, meaning that we need to sanitise *all* vcpus when the PMU version is changed. In orer to avoid a pretty complicated locking situation, defer the sanitisation of PMCR_EL0 to the point where the vcpu is actually run for the first tine, using the existing KVM_REQ_RELOAD_PMU request that calls into kvm_pmu_handle_pmcr(). There is still an obscure corner case where userspace could do the above trick, and then save the VM without running it. They would then observe an inconsistent state (PMUv3.1 + LP set), but that state will be fixed on the first run anyway whenever the guest gets restored on a host. Reported-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 6 ++++++ arch/arm64/kvm/sys_regs.c | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index bb7251e670a96..d8ea399430861 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -538,6 +538,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; + /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ + if (!kvm_pmu_is_3p5(vcpu)) + val &= ~ARMV8_PMU_PMCR_LP; + + __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index eb56ad031116b..528d253c571ab 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -693,15 +693,15 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; if (p->is_write) { - /* Only update writeable bits of PMCR */ + /* + * Only update writeable bits of PMCR (continuing into + * kvm_pmu_handle_pmcr() as well) + */ val = __vcpu_sys_reg(vcpu, PMCR_EL0); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; - if (!kvm_pmu_is_3p5(vcpu)) - val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; kvm_pmu_handle_pmcr(vcpu, val); kvm_vcpu_pmu_restore_guest(vcpu); } else { -- GitLab From 923d011febb4e2fb338036bb0ee6a0a7f9b10da1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 27 Nov 2022 22:52:10 +0100 Subject: [PATCH 0920/1423] gpio: Do not include when not really needed. is included only for using container_of(). Include instead, it is much lighter. Signed-off-by: Christophe JAILLET Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/of_gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index a5166eb934375..6db627257a7ba 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -34,7 +34,7 @@ enum of_gpio_flags { #ifdef CONFIG_OF_GPIO -#include +#include /* * OF GPIO chip for memory mapped banks -- GitLab From 4ef339bc053a62dac9017f80f7bb8cff0412bd29 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Nov 2022 16:17:28 +0200 Subject: [PATCH 0921/1423] gpiolib: Unify access to the device properties Some of the functions are using struct fwnode_handle, some struct device pointer. In the GPIO library the firmware node of the GPIO device is the same as GPIO node of the GPIO chip. Due to this fact we may use former to access properties everywhere in the code. Signed-off-by: Andy Shevchenko Reviewed-by: Brian Masney Tested-by: Marijn Suijten [Bartosz: stick to the 80-char limit where it's not hurting readability] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 51afdc6ac9198..2729f7ebab9d6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -367,12 +367,12 @@ static int gpiochip_set_desc_names(struct gpio_chip *gc) static int devprop_gpiochip_set_names(struct gpio_chip *chip) { struct gpio_device *gdev = chip->gpiodev; - const struct fwnode_handle *fwnode = dev_fwnode(&gdev->dev); + struct device *dev = &gdev->dev; const char **names; int ret, i; int count; - count = fwnode_property_string_array_count(fwnode, "gpio-line-names"); + count = device_property_string_array_count(dev, "gpio-line-names"); if (count < 0) return 0; @@ -385,7 +385,7 @@ static int devprop_gpiochip_set_names(struct gpio_chip *chip) * gpiochips. */ if (count <= chip->offset) { - dev_warn(&gdev->dev, "gpio-line-names too short (length %d), cannot map names for the gpiochip at offset %u\n", + dev_warn(dev, "gpio-line-names too short (length %d), cannot map names for the gpiochip at offset %u\n", count, chip->offset); return 0; } @@ -394,10 +394,10 @@ static int devprop_gpiochip_set_names(struct gpio_chip *chip) if (!names) return -ENOMEM; - ret = fwnode_property_read_string_array(fwnode, "gpio-line-names", + ret = device_property_read_string_array(dev, "gpio-line-names", names, count); if (ret < 0) { - dev_warn(&gdev->dev, "failed to read GPIO line names\n"); + dev_warn(dev, "failed to read GPIO line names\n"); kfree(names); return ret; } @@ -448,10 +448,11 @@ static unsigned long *gpiochip_allocate_mask(struct gpio_chip *gc) static unsigned int gpiochip_count_reserved_ranges(struct gpio_chip *gc) { + struct device *dev = &gc->gpiodev->dev; int size; /* Format is "start, count, ..." */ - size = fwnode_property_count_u32(gc->fwnode, "gpio-reserved-ranges"); + size = device_property_count_u32(dev, "gpio-reserved-ranges"); if (size > 0 && size % 2 == 0) return size; @@ -472,6 +473,7 @@ static int gpiochip_alloc_valid_mask(struct gpio_chip *gc) static int gpiochip_apply_reserved_ranges(struct gpio_chip *gc) { + struct device *dev = &gc->gpiodev->dev; unsigned int size; u32 *ranges; int ret; @@ -484,7 +486,8 @@ static int gpiochip_apply_reserved_ranges(struct gpio_chip *gc) if (!ranges) return -ENOMEM; - ret = fwnode_property_read_u32_array(gc->fwnode, "gpio-reserved-ranges", ranges, size); + ret = device_property_read_u32_array(dev, "gpio-reserved-ranges", + ranges, size); if (ret) { kfree(ranges); return ret; -- GitLab From 3ca9d84e722e8044c09e80992aa7b15bd904d3ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Nov 2022 19:40:01 -0500 Subject: [PATCH 0922/1423] KVM: always declare prototype for kvm_arch_irqchip_in_kernel Architecture code might want to use it even if CONFIG_HAVE_KVM_IRQ_ROUTING is false; for example PPC XICS has KVM_IRQ_LINE and wants to use kvm_arch_irqchip_in_kernel from there, but it does not have KVM_SET_GSI_ROUTING so the prototype was not provided. Fixes: d663b8a28598 ("KVM: replace direct irq.h inclusion") Reported-by: kernel test robot Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6f0f389f5f9cd..b8d12356f015f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -663,9 +663,9 @@ struct kvm_irq_routing_table { */ struct hlist_head map[]; }; +#endif bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); -#endif #ifndef KVM_INTERNAL_MEM_SLOTS #define KVM_INTERNAL_MEM_SLOTS 0 -- GitLab From c3f3719952b9ab64e001e36df3d7bf24d5a4752d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 14 Nov 2022 12:48:57 -0800 Subject: [PATCH 0923/1423] KVM: x86/xen: Add CPL to Xen hypercall tracepoint Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 15 +++++++++------ arch/x86/kvm/xen.c | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 09f3392dd8300..83843379813ee 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -113,12 +113,13 @@ TRACE_EVENT(kvm_hv_hypercall_done, * Tracepoint for Xen hypercall. */ TRACE_EVENT(kvm_xen_hypercall, - TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, unsigned long a4, - unsigned long a5), - TP_ARGS(nr, a0, a1, a2, a3, a4, a5), + TP_PROTO(u8 cpl, unsigned long nr, + unsigned long a0, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, unsigned long a5), + TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5), TP_STRUCT__entry( + __field(u8, cpl) __field(unsigned long, nr) __field(unsigned long, a0) __field(unsigned long, a1) @@ -129,6 +130,7 @@ TRACE_EVENT(kvm_xen_hypercall, ), TP_fast_assign( + __entry->cpl = cpl; __entry->nr = nr; __entry->a0 = a0; __entry->a1 = a1; @@ -138,8 +140,9 @@ TRACE_EVENT(kvm_xen_hypercall, __entry->a4 = a5; ), - TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", - __entry->nr, __entry->a0, __entry->a1, __entry->a2, + TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", + __entry->cpl, __entry->nr, + __entry->a0, __entry->a1, __entry->a2, __entry->a3, __entry->a4, __entry->a5) ); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f3098c0e386a8..4b8e9628fbf57 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1256,7 +1256,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) } #endif cpl = static_call(kvm_x86_get_cpl)(vcpu); - trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); /* -- GitLab From 7927e27549d3f02354233a9ab3f28e0080ede29b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 12 Nov 2022 14:28:20 +0000 Subject: [PATCH 0924/1423] MAINTAINERS: Add KVM x86/xen maintainer list Adding Paul as co-maintainer of Xen support to help ensure that things don't fall through the cracks when I spend three months at a time travelling... Signed-off-by: David Woodhouse Reviewed-by: Paul Durrant Signed-off-by: Paolo Bonzini --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 046ff06ff97fa..89672a59c0c3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11324,6 +11324,16 @@ F: arch/x86/kvm/svm/hyperv.* F: arch/x86/kvm/svm/svm_onhyperv.* F: arch/x86/kvm/vmx/evmcs.* +KVM X86 Xen (KVM/Xen) +M: David Woodhouse +M: Paul Durrant +M: Sean Christopherson +M: Paolo Bonzini +L: kvm@vger.kernel.org +S: Supported +T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git +F: arch/x86/kvm/xen.* + KERNFS M: Greg Kroah-Hartman M: Tejun Heo -- GitLab From c4690d016182d271a862767145db8b2bc792f4a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Nov 2022 20:58:38 +0800 Subject: [PATCH 0925/1423] KVM: x86: Add BUILD_BUG_ON() to detect bad usage of "scattered" flags Add a compile-time assert in the SF() macro to detect improper usage, i.e. to detect passing in an X86_FEATURE_* flag that isn't actually scattered by the kernel. Upcoming feature flags will be 100% KVM-only and will have X86_FEATURE_* macros that point at a kvm_only_cpuid_leafs word, not a kernel-defined word. Using SF() and thus boot_cpu_has() for such feature flags would access memory beyond x86_capability[NCAPINTS] and at best incorrectly hide a feature, and at worst leak kernel state to userspace. Signed-off-by: Sean Christopherson Message-Id: <20221125125845.1182922-2-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6b5912578eddc..ff2e9734e5c18 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -65,7 +65,13 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define F feature_bit -#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) + +/* Scattered Flag - For features that are scattered by cpufeatures.h. */ +#define SF(name) \ +({ \ + BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ + (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \ +}) /* * Magic value used by KVM when querying userspace-provided CPUID entries and -- GitLab From 047c7229906152fb85c23dc18fd25a00cd7cb4de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Nov 2022 20:58:39 +0800 Subject: [PATCH 0926/1423] KVM: x86: Update KVM-only leaf handling to allow for 100% KVM-only leafs Rename kvm_cpu_cap_init_scattered() to kvm_cpu_cap_init_kvm_defined() in anticipation of adding KVM-only CPUID leafs that aren't recognized by the kernel and thus not scattered, i.e. for leafs that are 100% KVM-defined. Adjust/add comments to kvm_only_cpuid_leafs and KVM_X86_FEATURE to document how to create new kvm_only_cpuid_leafs entries for scattered features as well as features that are entirely unknown to the kernel. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20221125125845.1182922-3-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 ++++---- arch/x86/kvm/reverse_cpuid.h | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ff2e9734e5c18..73c3c6dc6e7b5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -549,9 +549,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) } static __always_inline -void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ BUILD_BUG_ON(leaf < NCAPINTS); kvm_cpu_caps[leaf] = mask; @@ -561,7 +561,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { - /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ BUILD_BUG_ON(leaf >= NCAPINTS); kvm_cpu_caps[leaf] &= mask; @@ -670,7 +670,7 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); - kvm_cpu_cap_init_scattered(CPUID_12_EAX, + kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, SF(SGX1) | SF(SGX2) ); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index a19d473d01847..443a6b3e66c0e 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,9 +7,9 @@ #include /* - * Hardware-defined CPUID leafs that are scattered in the kernel, but need to - * be directly used by KVM. Note, these word values conflict with the kernel's - * "bug" caps, but KVM doesn't use those. + * Hardware-defined CPUID leafs that are either scattered by the kernel or are + * unknown to the kernel, but need to be directly used by KVM. Note, these + * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, @@ -18,6 +18,18 @@ enum kvm_only_cpuid_leafs { NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; +/* + * Define a KVM-only feature flag. + * + * For features that are scattered by cpufeatures.h, __feature_translate() also + * needs to be updated to translate the kernel-defined feature into the + * KVM-defined feature. + * + * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h, + * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so + * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is + * needed in this case. + */ #define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ -- GitLab From 6a19d7aa5821522eec528fd44f24fe774b875377 Mon Sep 17 00:00:00 2001 From: Jiaxi Chen Date: Fri, 25 Nov 2022 20:58:40 +0800 Subject: [PATCH 0927/1423] x86: KVM: Advertise CMPccXADD CPUID to user space CMPccXADD is a new set of instructions in the latest Intel platform Sierra Forest. This new instruction set includes a semaphore operation that can compare and add the operands if condition is met, which can improve database performance. The bit definition: CPUID.(EAX=7,ECX=1):EAX[bit 7] CMPccXADD is on an expected-dense CPUID leaf and some other bits on this leaf have kernel usages. Given that, define this feature bit like X86_FEATURE_ in kernel. Considering CMPccXADD itself has no truly kernel usages and /proc/cpuinfo has too much unreadable flags, hide this one in /proc/cpuinfo. Advertise CMPCCXADD to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Jiaxi Chen Acked-by: Borislav Petkov Message-Id: <20221125125845.1182922-4-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b71f4f2ecdd57..5cdd57133d904 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -308,6 +308,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 73c3c6dc6e7b5..2f263ff911d66 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -663,7 +663,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, -- GitLab From af2872f6225476566bcbbd523a74dcaba29e159e Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Fri, 25 Nov 2022 20:58:41 +0800 Subject: [PATCH 0928/1423] x86: KVM: Advertise AMX-FP16 CPUID to user space Latest Intel platform Granite Rapids has introduced a new instruction - AMX-FP16, which performs dot-products of two FP16 tiles and accumulates the results into a packed single precision tile. AMX-FP16 adds FP16 capability and also allows a FP16 GPU trained model to run faster without loss of accuracy or added SW overhead. The bit definition: CPUID.(EAX=7,ECX=1):EAX[bit 21] AMX-FP16 is on an expected-dense CPUID leaf and some other bits on this leaf have kernel usages. Given that, define this feature bit like X86_FEATURE_ in kernel. Considering AMX-FP16 itself has no truly kernel usages and /proc/cpuinfo has too much unreadable flags, hide this one in /proc/cpuinfo. Advertise AMX-FP16 to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Chang S. Bae Signed-off-by: Jiaxi Chen Acked-by: Borislav Petkov Message-Id: <20221125125845.1182922-5-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5cdd57133d904..20059dc33d246 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -309,6 +309,7 @@ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2f263ff911d66..6360075ca70f4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -663,7 +663,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, -- GitLab From 5e85c4ebf206e50c58e82ca44c15e2be2bac6923 Mon Sep 17 00:00:00 2001 From: Jiaxi Chen Date: Fri, 25 Nov 2022 20:58:42 +0800 Subject: [PATCH 0929/1423] x86: KVM: Advertise AVX-IFMA CPUID to user space AVX-IFMA is a new instruction in the latest Intel platform Sierra Forest. This instruction packed multiplies unsigned 52-bit integers and adds the low/high 52-bit products to Qword Accumulators. The bit definition: CPUID.(EAX=7,ECX=1):EAX[bit 23] AVX-IFMA is on an expected-dense CPUID leaf and some other bits on this leaf have kernel usages. Given that, define this feature bit like X86_FEATURE_ in kernel. Considering AVX-IFMA itself has no truly kernel usages and /proc/cpuinfo has too much unreadable flags, hide this one in /proc/cpuinfo. Advertise AVX-IFMA to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Jiaxi Chen Acked-by: Borislav Petkov Message-Id: <20221125125845.1182922-6-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 20059dc33d246..1419c4e04d45f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -310,6 +310,7 @@ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6360075ca70f4..5dfc0d036df5e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -663,7 +663,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) | + F(AVX_IFMA) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, -- GitLab From 24d74b9f5f2a972ac9228372adeac62b2dc10ea2 Mon Sep 17 00:00:00 2001 From: Jiaxi Chen Date: Fri, 25 Nov 2022 20:58:43 +0800 Subject: [PATCH 0930/1423] KVM: x86: Advertise AVX-VNNI-INT8 CPUID to user space AVX-VNNI-INT8 is a new set of instructions in the latest Intel platform Sierra Forest, aims for the platform to have superior AI capabilities. This instruction multiplies the individual bytes of two unsigned or unsigned source operands, then adds and accumulates the results into the destination dword element size operand. The bit definition: CPUID.(EAX=7,ECX=1):EDX[bit 4] AVX-VNNI-INT8 is on a new and sparse CPUID leaf and all bits on this leaf have no truly kernel use case for now. Given that and to save space for kernel feature bits, move this new leaf to KVM-only subleaf and plus an x86_FEATURE definition for AVX-VNNI-INT8 to direct it to the KVM entry. Advertise AVX-VNNI-INT8 to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Jiaxi Chen Message-Id: <20221125125845.1182922-7-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++++- arch/x86/kvm/reverse_cpuid.h | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5dfc0d036df5e..b6ce47c4e9728 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -667,6 +667,10 @@ void kvm_set_cpu_caps(void) F(AVX_IFMA) ); + kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, + F(AVX_VNNI_INT8) + ); + kvm_cpu_cap_mask(CPUID_D_1_EAX, F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); @@ -920,9 +924,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; entry->ecx = 0; - entry->edx = 0; } break; case 0xa: { /* Architectural Performance Monitoring */ diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 443a6b3e66c0e..84f56b662424c 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -13,6 +13,7 @@ */ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, + CPUID_7_1_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -36,6 +37,9 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ +#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) + struct cpuid_reg { u32 function; u32 index; @@ -60,6 +64,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, + [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, }; /* -- GitLab From 9977f0877de7f8fc51391e2d52bc993efbd58b90 Mon Sep 17 00:00:00 2001 From: Jiaxi Chen Date: Fri, 25 Nov 2022 20:58:44 +0800 Subject: [PATCH 0931/1423] KVM: x86: Advertise AVX-NE-CONVERT CPUID to user space AVX-NE-CONVERT is a new set of instructions which can convert low precision floating point like BF16/FP16 to high precision floating point FP32, and can also convert FP32 elements to BF16. This instruction allows the platform to have improved AI capabilities and better compatibility. The bit definition: CPUID.(EAX=7,ECX=1):EDX[bit 5] AVX-NE-CONVERT is on a KVM-only subleaf. Plus an x86_FEATURE definition for this feature bit to direct it to the KVM entry. Advertise AVX-NE-CONVERT to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Jiaxi Chen Message-Id: <20221125125845.1182922-8-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/reverse_cpuid.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b6ce47c4e9728..5a95624a658a8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -668,7 +668,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 84f56b662424c..43eff7207e017 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -39,6 +39,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) +#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) struct cpuid_reg { u32 function; -- GitLab From 29c46979b25d5ca867e9859bfdd088d028739cdf Mon Sep 17 00:00:00 2001 From: Jiaxi Chen Date: Fri, 25 Nov 2022 20:58:45 +0800 Subject: [PATCH 0932/1423] KVM: x86: Advertise PREFETCHIT0/1 CPUID to user space Latest Intel platform Granite Rapids has introduced a new instruction - PREFETCHIT0/1, which moves code to memory (cache) closer to the processor depending on specific hints. The bit definition: CPUID.(EAX=7,ECX=1):EDX[bit 14] PREFETCHIT0/1 is on a KVM-only subleaf. Plus an x86_FEATURE definition for this feature bit to direct it to the KVM entry. Advertise PREFETCHIT0/1 to KVM userspace. This is safe because there are no new VMX controls or additional host enabling required for guests to use this feature. Signed-off-by: Jiaxi Chen Message-Id: <20221125125845.1182922-9-jiaxi.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/reverse_cpuid.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5a95624a658a8..723502181a3a9 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -668,7 +668,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 43eff7207e017..203fdad07bae6 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -40,6 +40,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) struct cpuid_reg { u32 function; -- GitLab From 41e8f85a75fc60e1543e4903428a1b481b672a17 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 11 Nov 2022 09:04:06 -0800 Subject: [PATCH 0933/1423] f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE introduce a new ioctl to replace the whole content of a file atomically, which means it induces truncate and content update at the same time. We can start it with F2FS_IOC_START_ATOMIC_REPLACE and complete it with F2FS_IOC_COMMIT_ATOMIC_WRITE. Or abort it with F2FS_IOC_ABORT_ATOMIC_WRITE. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 21 +++++++++++++++------ fs/f2fs/segment.c | 13 ++++++++++++- include/uapi/linux/f2fs.h | 1 + 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b47ded653d1c..560fa80590e99 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3466,6 +3466,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a8cbf5bb1871..b89b5d755ce05 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -769,6 +769,7 @@ enum { FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 28f586e779992..ab0a0d3730f6e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2034,7 +2034,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); @@ -2103,15 +2103,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_write_inode(inode, NULL); - isize = i_size_read(inode); - fi->original_i_size = isize; - f2fs_i_size_write(fi->cow_inode, isize); - stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -4139,7 +4146,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8aa81238c770f..5ac026a572287 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,6 +197,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); @@ -261,14 +262,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 3121d127d5aae..955d440be1046 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -42,6 +42,7 @@ struct f2fs_comp_option) #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) +#define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) /* * should be same as XFS_IOC_GOINGDOWN. -- GitLab From d3b7b4afd6b2c344eabf9cc26b8bfa903c164c7c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Nov 2022 00:08:47 +0800 Subject: [PATCH 0934/1423] f2fs: fix to do sanity check on i_extra_isize in is_alive() syzbot found a f2fs bug: BUG: KASAN: slab-out-of-bounds in data_blkaddr fs/f2fs/f2fs.h:2891 [inline] BUG: KASAN: slab-out-of-bounds in is_alive fs/f2fs/gc.c:1117 [inline] BUG: KASAN: slab-out-of-bounds in gc_data_segment fs/f2fs/gc.c:1520 [inline] BUG: KASAN: slab-out-of-bounds in do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 Read of size 4 at addr ffff888076557568 by task kworker/u4:3/52 CPU: 1 PID: 52 Comm: kworker/u4:3 Not tainted 6.1.0-rc4-syzkaller-00362-gfef7fd48922d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x15e/0x45d mm/kasan/report.c:395 kasan_report+0xbb/0x1f0 mm/kasan/report.c:495 data_blkaddr fs/f2fs/f2fs.h:2891 [inline] is_alive fs/f2fs/gc.c:1117 [inline] gc_data_segment fs/f2fs/gc.c:1520 [inline] do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 f2fs_gc+0x88c/0x20a0 fs/f2fs/gc.c:1831 f2fs_balance_fs+0x544/0x6b0 fs/f2fs/segment.c:410 f2fs_write_inode+0x57e/0xe20 fs/f2fs/inode.c:753 write_inode fs/fs-writeback.c:1440 [inline] __writeback_single_inode+0xcfc/0x1440 fs/fs-writeback.c:1652 writeback_sb_inodes+0x54d/0xf90 fs/fs-writeback.c:1870 wb_writeback+0x2c5/0xd70 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x2dc/0x12f0 fs/fs-writeback.c:2227 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The root cause is that we forgot to do sanity check on .i_extra_isize in below path, result in accessing invalid address later, fix it. - gc_data_segment - is_alive - data_blkaddr - offset_in_addr Reported-by: syzbot+f8f3dfa4abc489e768a1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-f2fs-devel/0000000000003cb3c405ed5c17f9@google.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1a46519a5fe3..0f967b1e98f23 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1077,7 +1077,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1103,11 +1103,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } -- GitLab From 5b7b74b71c7fefbaa3e0ccc120c3cbd50b3fad86 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 12 Nov 2022 00:13:49 +0800 Subject: [PATCH 0935/1423] f2fs: remove submit label in __submit_discard_cmd() Complaint from Matthew Wilcox in another similar place: "submit? You don't submit anything at the 'submit' label. it should be called 'skip' or something. But I think this is just badly written and you don't need a goto at all." Let's remove submit label for readability. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5ac026a572287..8b0b765505785 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) -- GitLab From b7ad23cec26a91a4f7c45ff7ff8e915f21ac5127 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:35 +0800 Subject: [PATCH 0936/1423] f2fs: fix to alloc_mode changed after remount on a small volume device The commit 84b89e5d943d8 ("f2fs: add auto tuning for small devices") add tuning for small volume device, now support to tune alloce_mode to 'reuse' if it's small size. But the alloc_mode will change to 'default' when do remount on this small size dievce. This patch fo fix alloc_mode changed when do remount for a small volume device. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75027ff85cd93..96cfe626a670a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2059,7 +2059,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -4077,7 +4081,6 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | -- GitLab From 777cd95b8066ced4e9a2534941b81f8ad98e74fb Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:36 +0800 Subject: [PATCH 0937/1423] f2fs: cleanup for 'f2fs_tuning_parameters' function A cleanup patch for 'f2fs_tuning_parameters' function. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96cfe626a670a..05101cd4140b4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4077,13 +4077,11 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- GitLab From 66aee5aaa237e5a3475581b462b8b22e0944d264 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:37 +0800 Subject: [PATCH 0938/1423] f2fs: change type for 'sbi->readdir_ra' Before this patch, the varibale 'readdir_ra' takes effect if it's equal to '1' or not, so we can change type for it from 'int' to 'bool'. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 030b7fd4142ff..8e025157f35c9 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1010,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b89b5d755ce05..96bd3461c0bbb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1698,7 +1698,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 05101cd4140b4..31435c8645c83 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4085,7 +4085,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 97bf0dbb0974b..33ec467b37725 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -656,6 +656,11 @@ out: return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + *ui = (unsigned int)t; return count; -- GitLab From 4ff23a6547b81ca22adb852dfe93ee5fc45328ac Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 17 Nov 2022 23:10:54 +0800 Subject: [PATCH 0939/1423] f2fs: set zstd compress level correctly Fixes: cf30f6a5f0c6 ("lib: zstd: Add kernel-specific API") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Reviewed-by: Nick Terrell Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f2..74d3f2d2271f3 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; - params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), -- GitLab From 787caf1bdcd9f04058e4e8d8ed56db1dbafea0b7 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:29 +0800 Subject: [PATCH 0940/1423] f2fs: fix to enable compress for newly created file if extension matches If compress_extension is set, and a newly created file matches the extension, the file could be marked as compression file. However, if inline_data is also enabled, there is no chance to check its extension since f2fs_should_compress() always returns false. This patch moves set_compress_inode(), which do extension check, in f2fs_should_compress() to check extensions before setting inline data flag. Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 329 ++++++++++++++++++++++++------------------------ 2 files changed, 164 insertions(+), 167 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96bd3461c0bbb..f0833638f59ea 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2980,7 +2980,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e104409c3a0e5..54448dccbb6ad 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,8 +22,163 @@ #include "acl.h" #include +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, - struct inode *dir, umode_t mode) + struct inode *dir, umode_t mode, + const char *name) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -114,12 +269,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); /* Should enable inline_data after compression set */ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -153,40 +304,6 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int i; - - if (sublen == 1 && *sub == '*') - return 1; - - /* - * filename format of multimedia file should be defined as: - * "filename + '.' + extension + (optional: '.' + temp extension)". - */ - if (slen < sublen + 2) - return 0; - - if (!tmp_ext) { - /* file has no temp extension */ - if (s[slen - sublen - 1] != '.') - return 0; - return !strncasecmp(s + slen - sublen, sub, sublen); - } - - for (i = 1; i < slen - sublen; i++) { - if (s[i] != '.') - continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; - } - - return 0; -} - /* * Set file's temperature for hot/cold data separation */ @@ -217,124 +334,6 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * file_set_hot(inode); } -int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool hot, bool set) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int cold_count = le32_to_cpu(sbi->raw_super->extension_count); - int hot_count = sbi->raw_super->hot_ext_count; - int total_count = cold_count + hot_count; - int start, count; - int i; - - if (set) { - if (total_count == F2FS_MAX_EXTENSION) - return -EINVAL; - } else { - if (!hot && !cold_count) - return -EINVAL; - if (hot && !hot_count) - return -EINVAL; - } - - if (hot) { - start = cold_count; - count = total_count; - } else { - start = 0; - count = cold_count; - } - - for (i = start; i < count; i++) { - if (strcmp(name, extlist[i])) - continue; - - if (set) - return -EINVAL; - - memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (total_count - i - 1)); - memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); - if (hot) - sbi->raw_super->hot_ext_count = hot_count - 1; - else - sbi->raw_super->extension_count = - cpu_to_le32(cold_count - 1); - return 0; - } - - if (!set) - return -EINVAL; - - if (hot) { - memcpy(extlist[count], name, strlen(name)); - sbi->raw_super->hot_ext_count = hot_count + 1; - } else { - char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - - memcpy(buf, &extlist[cold_count], - F2FS_EXTENSION_LEN * hot_count); - memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - memcpy(extlist[cold_count], name, strlen(name)); - memcpy(&extlist[cold_count + 1], buf, - F2FS_EXTENSION_LEN * hot_count); - sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); - } - return 0; -} - -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; - unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; - unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; - int i, cold_count, hot_count; - - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) - return; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); - return; - } - } - - f2fs_up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; - - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, FI_INLINE_DATA); - set_compress_context(inode); - return; - } -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -352,15 +351,13 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -689,7 +686,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -760,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -817,7 +814,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -856,7 +853,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); -- GitLab From b16bcaaf7a325f90967259a0b7cfcce4ff8c56ba Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:30 +0800 Subject: [PATCH 0941/1423] f2fs: move set_file_temperature into f2fs_new_inode Since the file name has already passed to f2fs_new_inode(), let's move set_file_temperature() into f2fs_new_inode(). Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 62 +++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 54448dccbb6ad..58a91ce8fe083 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -176,6 +176,32 @@ inherit_comp: } } +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, struct inode *dir, umode_t mode, const char *name) @@ -276,6 +302,9 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -304,36 +333,6 @@ fail_drop: return ERR_PTR(err); } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -355,9 +354,6 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; -- GitLab From fc031877b8229ee2f56a2cc3868ca7f282b1231d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 19 Nov 2022 01:40:28 +0800 Subject: [PATCH 0942/1423] f2fs: fix description about discard_granularity node Let's fix the inconsistency in the text description. Default discard granularity is 16. For small devices, default value is 1. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 24e7cb77f265b..32404781e76fa 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -138,7 +138,8 @@ Contact: "Chao Yu" Description: Controls discard granularity of inner discard thread. Inner thread will not issue discards with size that is smaller than granularity. The unit size is one block(4KB), now only support configuring - in range of [1, 512]. Default value is 4(=16KB). + in range of [1, 512]. Default value is 16. + For small devices, default value is 1. What: /sys/fs/f2fs//umount_discard_timeout Date: January 2019 -- GitLab From 620816393239890feff8608251e2746b1cc2cfa0 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 17 Nov 2022 01:10:45 +0800 Subject: [PATCH 0943/1423] f2fs: make __queue_discard_cmd() return void Since __queue_discard_cmd() never returns an error, let's make it return void. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8b0b765505785..14ece4bf7c7e2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1358,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1376,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1776,7 +1775,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1787,7 +1787,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, -- GitLab From 78a99fe6254cad4be310cd84af39f6c46b668c72 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Nov 2022 06:42:52 +0800 Subject: [PATCH 0944/1423] f2fs: truncate blocks in batch in __complete_revoke_list() Use f2fs_do_truncate_blocks() to truncate all blocks in-batch in __complete_revoke_list(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 14ece4bf7c7e2..37c721e1eb034 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -262,24 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; - pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) { + if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); - } else if (truncate) { - f2fs_truncate_hole(inode, start_index, cur->index); - start_index = cur->index + 1; - } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) -- GitLab From e219aecfd4b766c4e878a3769057e9809f7fcadc Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 22 Nov 2022 18:03:20 +0900 Subject: [PATCH 0945/1423] f2fs: avoid victim selection from previous victim section When f2fs chooses GC victim in large section & LFS mode, next_victim_seg[gc_type] is referenced first. After segment is freed, next_victim_seg[gc_type] has the next segment number. However, next_victim_seg[gc_type] still has the last segment number even after the last segment of section is freed. In this case, when f2fs chooses a victim for the next GC round, the last segment of previous victim section is chosen as a victim. Initialize next_victim_seg[gc_type] to NULL_SEGNO for the last segment in large section. Fixes: e3080b0120a1 ("f2fs: support subsectional garbage collection") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0f967b1e98f23..f1b68eda2235d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1749,8 +1749,9 @@ freed: get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } -- GitLab From 48c08c51f938d955dd8f5b8972bc29faa4c9556f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 18 Nov 2022 11:46:00 +0800 Subject: [PATCH 0946/1423] f2fs: init discard policy after thread wakeup Under the current logic, after the discard thread wakes up, it will not run according to the expected policy, but will use the expected policy before sleep. Move the strategy selection to after the thread wakes up, so that the running state of the thread meets expectations. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 37c721e1eb034..73ad8dc9a4d3c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1679,6 +1679,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1686,14 +1691,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1707,12 +1704,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1727,6 +1723,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); -- GitLab From 1cd2e6d544359ae13e6fd9029b6018b957cf08c3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:01 +0800 Subject: [PATCH 0947/1423] f2fs: define MIN_DISCARD_GRANULARITY macro Do cleanup in f2fs_tuning_parameters() and __init_discard_policy(), let's use macro instead of number. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f0833638f59ea..4694b55b6df49 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -329,6 +329,8 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73ad8dc9a4d3c..c7afcc6cd75bf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1065,7 +1065,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1080,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31435c8645c83..daf14b55a9729 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4080,7 +4080,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- GitLab From 8a47d228de6a4fd4c751142fb27d56f385b3fe41 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:02 +0800 Subject: [PATCH 0948/1423] f2fs: introduce discard_urgent_util sysfs node Through this node, you can control the background discard to run more aggressively or not aggressively when reach the utilization rate of the space. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 9 +++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 32404781e76fa..84a009aab1a13 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -647,3 +647,11 @@ Date: October 2022 Contact: "Yangtao Li" Description: Show the current gc_mode as a string. This is a read-only entry. + +What: /sys/fs/f2fs//discard_urgent_util +Date: November 2022 +Contact: "Yangtao Li" +Description: When space utilization exceeds this, do background DISCARD aggressively. + Does DISCARD forcibly in a period of given min_discard_issue_time when the number + of discards is not 0 and set discard granularity to 1. + Default: 80 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4694b55b6df49..296683648d4fa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -413,6 +413,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c7afcc6cd75bf..0ff451ea18f66 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1064,7 +1064,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = @@ -2079,6 +2079,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 33ec467b37725..a4745d596310f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -493,6 +493,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -800,6 +807,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -930,6 +938,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), -- GitLab From f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:11 +1100 Subject: [PATCH 0949/1423] iomap: buffered write failure should not truncate the page cache iomap_file_buffered_write_punch_delalloc() currently invalidates the page cache over the unused range of the delalloc extent that was allocated. While the write allocated the delalloc extent, it does not own it exclusively as the write does not hold any locks that prevent either writeback or mmap page faults from changing the state of either the page cache or the extent state backing this range. Whilst xfs_bmap_punch_delalloc_range() already handles races in extent conversion - it will only punch out delalloc extents and it ignores any other type of extent - the page cache truncate does not discriminate between data written by this write or some other task. As a result, truncating the page cache can result in data corruption if the write races with mmap modifications to the file over the same range. generic/346 exercises this workload, and if we randomly fail writes (as will happen when iomap gets stale iomap detection later in the patchset), it will randomly corrupt the file data because it removes data written by mmap() in the same page as the write() that failed. Hence we do not want to punch out the page cache over the range of the extent we failed to write to - what we actually need to do is detect the ranges that have dirty data in cache over them and *not punch them out*. To do this, we have to walk the page cache over the range of the delalloc extent we want to remove. This is made complex by the fact we have to handle partially up-to-date folios correctly and this can happen even when the FSB size == PAGE_SIZE because we now support multi-page folios in the page cache. Because we are only interested in discovering the edges of data ranges in the page cache (i.e. hole-data boundaries) we can make use of mapping_seek_hole_data() to find those transitions in the page cache. As we hold the invalidate_lock, we know that the boundaries are not going to change while we walk the range. This interface is also byte-based and is sub-page block aware, so we can find the data ranges in the cache based on byte offsets rather than page, folio or fs block sized chunks. This greatly simplifies the logic of finding dirty cached ranges in the page cache. Once we've identified a range that contains cached data, we can then iterate the range folio by folio. This allows us to determine if the data is dirty and hence perform the correct delalloc extent punching operations. The seek interface we use to iterate data ranges will give us sub-folio start/end granularity, so we may end up looking up the same folio multiple times as the seek interface iterates across each discontiguous data region in the folio. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 195 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 15 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 734b761a1e4ab..dca9ec9dc4a8e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,165 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * Scan the data range passed to us for dirty page cache folios. If we find a + * dirty folio, punch out the preceeding range and update the offset from which + * the next punch will start from. + * + * We can punch out storage reservations under clean pages because they either + * contain data that has been written back - in which case the delalloc punch + * over that range is a no-op - or they have been read faults in which case they + * contain zeroes and we can remove the delalloc backing range and any new + * writes to those pages will do the normal hole filling operation... + * + * This makes the logic simple: we only need to keep the delalloc extents only + * over the dirty ranges of the page cache. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations. + */ +static int iomap_write_delalloc_scan(struct inode *inode, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) +{ + while (start_byte < end_byte) { + struct folio *folio; + + /* grab locked page */ + folio = filemap_lock_folio(inode->i_mapping, + start_byte >> PAGE_SHIFT); + if (!folio) { + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + + PAGE_SIZE; + continue; + } + + /* if dirty, punch up to offset */ + if (folio_test_dirty(folio)) { + if (start_byte > *punch_start_byte) { + int error; + + error = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (error) { + folio_unlock(folio); + folio_put(folio); + return error; + } + } + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_next_index(folio) << PAGE_SHIFT); + } + + /* move offset to start of next folio in range */ + start_byte = folio_next_index(folio) << PAGE_SHIFT; + folio_unlock(folio); + folio_put(folio); + } + return 0; +} + +/* + * Punch out all the delalloc blocks in the range given except for those that + * have dirty data still pending in the page cache - those are going to be + * written and so must still retain the delalloc backing for writeback. + * + * As we are scanning the page cache for data, we don't need to reimplement the + * wheel - mapping_seek_hole_data() does exactly what we need to identify the + * start and end of data ranges correctly even for sub-folio block sizes. This + * byte range based iteration is especially convenient because it means we + * don't have to care about variable size folios, nor where the start or end of + * the data range lies within a folio, if they lie within the same folio or even + * if there are multiple discontiguous data ranges within the folio. + * + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to + * date. A write page fault can then mark it dirty. If we then fail a write() + * beyond EOF into that up to date cached range, we allocate a delalloc block + * beyond EOF and then have to punch it out. Because the range is up to date, + * mapping_seek_hole_data() will return it, and we will skip the punch because + * the folio is dirty. THis is incorrect - we always need to punch out delalloc + * beyond EOF in this case as writeback will never write back and covert that + * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, + * resulting in always punching out the range from the EOF to the end of the + * range the iomap spans. + * + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) + * returns the end of the data range (data_end). Using closed intervals would + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose + * the code to subtle off-by-one bugs.... + */ +static int iomap_write_delalloc_release(struct inode *inode, + loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t punch_start_byte = start_byte; + loff_t scan_end_byte = min(i_size_read(inode), end_byte); + int error = 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite whilst we walk the + * cache and perform delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + while (start_byte < scan_end_byte) { + loff_t data_end; + + start_byte = mapping_seek_hole_data(inode->i_mapping, + start_byte, scan_end_byte, SEEK_DATA); + /* + * If there is no more data to scan, all that is left is to + * punch out the remaining range. + */ + if (start_byte == -ENXIO || start_byte == scan_end_byte) + break; + if (start_byte < 0) { + error = start_byte; + goto out_unlock; + } + WARN_ON_ONCE(start_byte < punch_start_byte); + WARN_ON_ONCE(start_byte > scan_end_byte); + + /* + * We find the end of this contiguous cached data range by + * seeking from start_byte to the beginning of the next hole. + */ + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, + scan_end_byte, SEEK_HOLE); + if (data_end < 0) { + error = data_end; + goto out_unlock; + } + WARN_ON_ONCE(data_end <= start_byte); + WARN_ON_ONCE(data_end > scan_end_byte); + + error = iomap_write_delalloc_scan(inode, &punch_start_byte, + start_byte, data_end, punch); + if (error) + goto out_unlock; + + /* The next data search starts at the end of this one. */ + start_byte = data_end; + } + + if (punch_start_byte < end_byte) + error = punch(inode, punch_start_byte, + end_byte - punch_start_byte); +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + /* * When a short write occurs, the filesystem may need to remove reserved space * that was allocated in ->iomap_begin from it's ->iomap_end method. For @@ -842,8 +1001,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); * allocated for this iomap. * * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations, but converts them back to {offset,len} tuples for - * the punch callback. + * simplify range iterations. + * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, @@ -853,7 +1029,6 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t start_byte; loff_t end_byte; int blocksize = i_blocksize(inode); - int error = 0; if (iomap->type != IOMAP_DELALLOC) return 0; @@ -877,18 +1052,8 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, if (start_byte >= end_byte) return 0; - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = punch(inode, start_byte, end_byte - start_byte); - filemap_invalidate_unlock(inode->i_mapping); - - return error; + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); } EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); -- GitLab From 7348b322332d8602a4133f0b861334ea021b134a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: [PATCH 0950/1423] xfs: xfs_bmap_punch_delalloc_range() should take a byte range All the callers of xfs_bmap_punch_delalloc_range() jump through hoops to convert a byte range to filesystem blocks before calling xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to xfs_bmap_punch_delalloc_range() and have it do the conversion to filesystem blocks internally. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 16 ++++++---------- fs/xfs/xfs_bmap_util.c | 10 ++++++---- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_iomap.c | 8 ++------ 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f83..6aadc5815068e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,9 +114,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -455,12 +454,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +465,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c4..867645b74d889 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ out_unlock_iolock: int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1dc..6888078f5c31e 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ea96e8a348687..09676ff6940eb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch( loff_t offset, loff_t length) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); - - return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, - end_fsb - start_fsb); + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); } static int -- GitLab From d7b64041164ca177170191d2ad775da074ab2926 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: [PATCH 0951/1423] iomap: write iomap validity checks A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page: writeback new write allocate blocks blocks are unwritten submit IO ..... map blocks iomap indicates UNWRITTEN range loop { lock folio copyin data ..... IO completes runs unwritten extent conv blocks are marked written get next folio } Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk. When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written. The full description of the corruption mechanism can be found here: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked. Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range. The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) to for the validity cookie to cover all the types of iomaps we might need to validate. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 29 +++++++++++++++++++++++++++- fs/iomap/iter.c | 19 ++++++++++++++++++- include/linux/iomap.h | 43 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 81 insertions(+), 10 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index dca9ec9dc4a8e..356193e44cf07 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(iter->inode, + &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; @@ -773,6 +794,8 @@ again: status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + if (iter->iomap.flags & IOMAP_F_STALE) + break; page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(mapping)) @@ -1081,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -1136,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2aded..79a0614eaab77 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include #include "trace.h" +/* + * Advance to the next range we need to map. + * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. + */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0698c4b8ce0e2..0983dfc9a203c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -49,26 +49,35 @@ struct vm_fault; * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. */ -#define IOMAP_F_NEW 0x01 -#define IOMAP_F_DIRTY 0x02 -#define IOMAP_F_SHARED 0x04 -#define IOMAP_F_MERGED 0x08 -#define IOMAP_F_BUFFER_HEAD 0x10 -#define IOMAP_F_ZONE_APPEND 0x20 +#define IOMAP_F_NEW (1U << 0) +#define IOMAP_F_DIRTY (1U << 1) +#define IOMAP_F_SHARED (1U << 2) +#define IOMAP_F_MERGED (1U << 3) +#define IOMAP_F_BUFFER_HEAD (1U << 4) +#define IOMAP_F_ZONE_APPEND (1U << 5) +#define IOMAP_F_XATTR (1U << 6) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. + * + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. */ -#define IOMAP_F_SIZE_CHANGED 0x100 +#define IOMAP_F_SIZE_CHANGED (1U << 8) +#define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ -#define IOMAP_F_PRIVATE 0x1000 +#define IOMAP_F_PRIVATE (1U << 12) /* @@ -89,6 +98,7 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) @@ -128,6 +138,23 @@ struct iomap_page_ops { int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the folio over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* -- GitLab From 304a68b9c63bbfc1f6e159d68e8892fc54a06067 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: [PATCH 0952/1423] xfs: use iomap_valid method to detect stale cached iomaps Now that iomap supports a mechanism to validate cached iomaps for buffered write operations, hook it up to the XFS buffered write ops so that we can avoid data corruptions that result from stale cached iomaps. See: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ or the ->iomap_valid() introduction commit for exact details of the corruption vector. The validity cookie we store in the iomap is based on the type of iomap we return. It is expected that the iomap->flags we set in xfs_bmbt_to_iomap() is not perturbed by the iomap core and are returned to us in the iomap passed via the .iomap_valid() callback. This ensures that the validity cookie is always checking the correct inode fork sequence numbers to detect potential changes that affect the extent cached by the iomap. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 6 ++- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_iomap.c | 95 +++++++++++++++++++++++++++++++--------- fs/xfs/xfs_iomap.h | 5 ++- fs/xfs/xfs_pnfs.c | 6 ++- 5 files changed, 87 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fcd..56b9b7db38bbd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6aadc5815068e..a22d90af40c85 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -372,7 +372,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 09676ff6940eb..26ca3cc1a0489 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,45 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +123,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +230,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +321,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +878,26 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1094,26 +1136,31 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1193,6 +1240,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1206,13 +1254,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 0f62ab633040c..4da13440bae9b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd40..38d23f0e703a8 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; out_unlock: -- GitLab From 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: [PATCH 0953/1423] xfs: drop write error injection is unfixable, remove it With the changes to scan the page cache for dirty data to avoid data corruptions from partial write cleanup racing with other page cache operations, the drop writes error injection no longer works the same way it used to and causes xfs/196 to fail. This is because xfs/196 writes to the file and populates the page cache before it turns on the error injection and starts failing -overwrites-. The result is that the original drop-writes code failed writes only -after- overwriting the data in the cache, followed by invalidates the cached data, then punching out the delalloc extent from under that data. On the surface, this looks fine. The problem is that page cache invalidation *doesn't guarantee that it removes anything from the page cache* and it doesn't change the dirty state of the folio. When block size == page size and we do page aligned IO (as xfs/196 does) everything happens to align perfectly and page cache invalidation removes the single page folios that span the written data. Hence the followup delalloc punch pass does not find cached data over that range and it can punch the extent out. IOWs, xfs/196 "works" for block size == page size with the new code. I say "works", because it actually only works for the case where IO is page aligned, and no data was read from disk before writes occur. Because the moment we actually read data first, the readahead code allocates multipage folios and suddenly the invalidate code goes back to zeroing subfolio ranges without changing dirty state. Hence, with multipage folios in play, block size == page size is functionally identical to block size < page size behaviour, and drop-writes is manifestly broken w.r.t to this case. Invalidation of a subfolio range doesn't result in the folio being removed from the cache, just the range gets zeroed. Hence after we've sequentially walked over a folio that we've dirtied (via write data) and then invalidated, we end up with a dirty folio full of zeroed data. And because the new code skips punching ranges that have dirty folios covering them, we end up leaving the delalloc range intact after failing all the writes. Hence failed writes now end up writing zeroes to disk in the cases where invalidation zeroes folios rather than removing them from cache. This is a fundamental change of behaviour that is needed to avoid the data corruption vectors that exist in the old write fail path, and it renders the drop-writes injection non-functional and unworkable as it stands. As it is, I think the error injection is also now unnecessary, as partial writes that need delalloc extent are going to be a lot more common with stale iomap detection in place. Hence this patch removes the drop-writes error injection completely. xfs/196 can remain for testing kernels that don't have this data corruption fix, but those that do will report: xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_errortag.h | 12 +++++------- fs/xfs/xfs_error.c | 27 ++++++++++++++++++++------- fs/xfs/xfs_iomap.c | 9 --------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0b..580ccbd5aadc2 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -95,7 +94,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f187..dea3c0649d2f7 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -256,6 +254,19 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -277,7 +288,9 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; if (!randfactor || prandom_u32_max(randfactor)) return false; @@ -293,7 +306,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -305,7 +318,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -319,7 +332,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 26ca3cc1a0489..1bdd7afc10108 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end( struct xfs_mount *mp = XFS_M(inode->i_sb); int error; - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; - } - error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { -- GitLab From c2beff99eb03866df6fdbd3a93b08fd27eb8bf5c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:35 -0800 Subject: [PATCH 0954/1423] xfs: add debug knob to slow down writeback for fun Add a new error injection knob so that we can arbitrarily slow down writeback to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 745b3f76d1c8 ("xfs: maintain a sequence count for inode fork manipulations"). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_aops.c | 14 ++++++++++-- fs/xfs/xfs_error.c | 16 +++++++++++++ fs/xfs/xfs_error.h | 13 +++++++++++ fs/xfs/xfs_trace.c | 2 ++ fs/xfs/xfs_trace.h | 44 ++++++++++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 580ccbd5aadc2..f5f629174ecad 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -61,7 +61,8 @@ #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 -#define XFS_ERRTAG_MAX 42 +#define XFS_ERRTAG_WB_DELAY_MS 42 +#define XFS_ERRTAG_MAX 43 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -107,5 +108,6 @@ #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 +#define XFS_RANDOM_WB_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a22d90af40c85..41734202796f4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -17,6 +17,8 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" +#include "xfs_errortag.h" +#include "xfs_error.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -217,11 +219,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); return false; + } if (xfs_inode_has_cow_data(ip) && - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); return false; + } return true; } @@ -285,6 +293,8 @@ xfs_map_blocks( if (xfs_is_shutdown(mp)) return -EIO; + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dea3c0649d2f7..2d6e3c718e03d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -60,6 +60,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, }; struct xfs_errortag_attr { @@ -175,6 +176,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -218,6 +220,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); @@ -267,6 +270,19 @@ xfs_errortag_valid( return true; } +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5191e9145e552..dbe6c37dc6975 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(expr, mp, tag) \ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); +#define XFS_ERRORTAG_DELAY(mp, tag) \ + do { \ + might_sleep(); \ + if (!xfs_errortag_enabled((mp), (tag))) \ + break; \ + xfs_warn_ratelimited((mp), \ +"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ + (mp)->m_super->s_id); \ + mdelay((mp)->m_errortag[(tag)]); \ + } while (0) extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, @@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index d269ef57ff016..8a5dc1538aa82 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -34,6 +34,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" +#include +#include "xfs_iomap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 372d871bccc5e..c9ada9577a4a7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3352,6 +3352,50 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) +/* inode iomap invalidation events */ +DECLARE_EVENT_CLASS(xfs_wb_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), + TP_ARGS(ip, iomap, wpcseq, whichfork), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u16, type) + __field(u16, flags) + __field(u32, wpcseq) + __field(u32, forkseq) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->wpcseq = wpcseq; + __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->wpcseq, + __entry->forkseq) +); +#define DEFINE_WB_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_wb_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ + TP_ARGS(ip, iomap, wpcseq, whichfork)) +DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); +DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- GitLab From 254e3459285cbf2174350bbc0051e475e1bc5196 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:36 -0800 Subject: [PATCH 0955/1423] xfs: add debug knob to slow down write for fun Add a new error injection knob so that we can arbitrarily slow down pagecache writes to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 304a68b9c63b ("xfs: use iomap_valid method to detect stale cached iomaps") that fixes write racing with reclaim writeback. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_error.c | 3 +++ fs/xfs/xfs_iomap.c | 14 ++++++++++-- fs/xfs/xfs_trace.h | 42 ++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index f5f629174ecad..01a9e86b30379 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -62,7 +62,8 @@ #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 -#define XFS_ERRTAG_MAX 43 +#define XFS_ERRTAG_WRITE_DELAY_MS 43 +#define XFS_ERRTAG_MAX 44 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -109,5 +110,6 @@ #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 +#define XFS_RANDOM_WRITE_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2d6e3c718e03d..713341d246d18 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, }; struct xfs_errortag_attr { @@ -177,6 +178,7 @@ XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -221,6 +223,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1bdd7afc10108..1005f1e365454 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,8 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -71,8 +73,16 @@ xfs_iomap_valid( struct inode *inode, const struct iomap *iomap) { - return iomap->validity_cookie == - xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); + struct xfs_inode *ip = XFS_I(inode); + + if (iomap->validity_cookie != + xfs_iomap_inode_sequence(ip, iomap->flags)) { + trace_xfs_iomap_invalid(ip, iomap); + return false; + } + + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); + return true; } const struct iomap_page_ops xfs_iomap_page_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c9ada9577a4a7..421d1e504ac43 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3396,6 +3396,48 @@ DEFINE_EVENT(xfs_wb_invalid_class, name, \ DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); +DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), + TP_ARGS(ip, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u64, validity_cookie) + __field(u64, inodeseq) + __field(u16, type) + __field(u16, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->validity_cookie = iomap->validity_cookie; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->validity_cookie, + __entry->inodeseq) +); +#define DEFINE_IOMAP_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_iomap_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ + TP_ARGS(ip, iomap)) +DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- GitLab From 2d6c66f5253e7d168a76048d18e1209c52f98a2b Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Tue, 29 Nov 2022 15:54:07 +0800 Subject: [PATCH 0956/1423] RDMA/mlx4: Remove NULL check before dev_{put, hold} The call netdev_{put, hold} of dev_{put, hold} will check NULL, so there is no need to check before using dev_{put, hold}. Fix the following coccicheck warnings: /drivers/infiniband/hw/mlx4/main.c:1311:2-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:148:2-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:1959:3-11: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:1962:3-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. Signed-off-by: zhang songyi Link: https://lore.kernel.org/r/202211291554079687539@zte.com.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/main.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ba47874f90d38..dceebcd885bbd 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -144,8 +144,7 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, } } } - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; @@ -1307,8 +1306,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, spin_lock_bh(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; - if (ndev) - dev_hold(ndev); + dev_hold(ndev); spin_unlock_bh(&mdev->iboe.lock); if (ndev) { @@ -1955,11 +1953,9 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (ge) { spin_lock_bh(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; - if (ndev) - dev_hold(ndev); + dev_hold(ndev); spin_unlock_bh(&mdev->iboe.lock); - if (ndev) - dev_put(ndev); + dev_put(ndev); list_del(&ge->list); kfree(ge); } else -- GitLab From b0284cd29a957e62d60c2886fd663be93c56f9c0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:34 -0700 Subject: [PATCH 0957/1423] mm: Do not enable PG_arch_2 for all 64-bit architectures Commit 4beba9486abd ("mm: Add PG_arch_2 page flag") introduced a new page flag for all 64-bit architectures. However, even if an architecture is 64-bit, it may still have limited spare bits in the 'flags' member of 'struct page'. This may happen if an architecture enables SPARSEMEM without SPARSEMEM_VMEMMAP as is the case with the newly added loongarch. This architecture port needs 19 more bits for the sparsemem section information and, while it is currently fine with PG_arch_2, adding any more PG_arch_* flags will trigger build-time warnings. Add a new CONFIG_ARCH_USES_PG_ARCH_X option which can be selected by architectures that need more PG_arch_* flags beyond PG_arch_1. Select it on arm64. Signed-off-by: Catalin Marinas [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne Reported-by: kernel test robot Cc: Andrew Morton Cc: Steven Price Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-2-pcc@google.com --- arch/arm64/Kconfig | 1 + fs/proc/page.c | 2 +- include/linux/page-flags.h | 2 +- include/trace/events/mmflags.h | 8 ++++---- mm/Kconfig | 8 ++++++++ mm/huge_memory.c | 2 +- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 505c8a1ccbe0c..cd93d07384256 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1965,6 +1965,7 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_USES_PG_ARCH_X help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/fs/proc/page.c b/fs/proc/page.c index f2273b164535b..882525c8e94c5 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -219,7 +219,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60c..5dc7977edf9d5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,7 +132,7 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, #endif #ifdef CONFIG_KASAN_HW_TAGS diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3c..d9f6d35fb1509 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -91,10 +91,10 @@ #define IF_HAVE_PG_IDLE(flag,string) #endif -#ifdef CONFIG_64BIT -#define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string} +#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string} #else -#define IF_HAVE_PG_ARCH_2(flag,string) +#define IF_HAVE_PG_ARCH_X(flag,string) #endif #ifdef CONFIG_KASAN_HW_TAGS @@ -130,7 +130,7 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ -IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") #define show_page_flags(flags) \ diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b5052..807bd7192f519 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1005,6 +1005,14 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config ARCH_USES_PG_ARCH_X + bool + help + Enable the definition of PG_arch_x page flags with x > 1. Only + suitable for 64-bit architectures with CONFIG_FLATMEM or + CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be + enough room for additional bits in page->flags. + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EXPERT diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 561a42567477d..76e7b973919ca 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2444,7 +2444,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | #endif (1L << PG_dirty) | -- GitLab From e059853d14ca4ed0f6a190d7109487918a22a976 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:35 -0700 Subject: [PATCH 0958/1423] arm64: mte: Fix/clarify the PG_mte_tagged semantics Currently the PG_mte_tagged page flag mostly means the page contains valid tags and it should be set after the tags have been cleared or restored. However, in mte_sync_tags() it is set before setting the tags to avoid, in theory, a race with concurrent mprotect(PROT_MTE) for shared pages. However, a concurrent mprotect(PROT_MTE) with a copy on write in another thread can cause the new page to have stale tags. Similarly, tag reading via ptrace() can read stale tags if the PG_mte_tagged flag is set before actually clearing/restoring the tags. Fix the PG_mte_tagged semantics so that it is only set after the tags have been cleared or restored. This is safe for swap restoring into a MAP_SHARED or CoW page since the core code takes the page lock. Add two functions to test and set the PG_mte_tagged flag with acquire and release semantics. The downside is that concurrent mprotect(PROT_MTE) on a MAP_SHARED page may cause tag loss. This is already the case for KVM guests if a VMM changes the page protection while the guest triggers a user_mem_abort(). Signed-off-by: Catalin Marinas [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne Reviewed-by: Cornelia Huck Reviewed-by: Steven Price Cc: Will Deacon Cc: Marc Zyngier Cc: Peter Collingbourne Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-3-pcc@google.com --- arch/arm64/include/asm/mte.h | 30 ++++++++++++++++++++++++++++++ arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/cpufeature.c | 4 +++- arch/arm64/kernel/elfcore.c | 2 +- arch/arm64/kernel/hibernate.c | 2 +- arch/arm64/kernel/mte.c | 17 +++++++++++------ arch/arm64/kvm/guest.c | 4 ++-- arch/arm64/kvm/mmu.c | 4 ++-- arch/arm64/mm/copypage.c | 5 +++-- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/mteswap.c | 2 +- 11 files changed, 56 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 760c62f8e22f8..3f8199ba265a1 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -37,6 +37,29 @@ void mte_free_tag_storage(char *storage); /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 +static inline void set_page_mte_tagged(struct page *page) +{ + /* + * Ensure that the tags written prior to this function are visible + * before the page flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &page->flags); +} + +static inline bool page_mte_tagged(struct page *page) +{ + bool ret = test_bit(PG_mte_tagged, &page->flags); + + /* + * If the page is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t old_pte, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); @@ -56,6 +79,13 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size); /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 +static inline void set_page_mte_tagged(struct page *page) +{ +} +static inline bool page_mte_tagged(struct page *page) +{ + return false; +} static inline void mte_zero_clear_page_tags(void *addr) { } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 71a1af42f0e89..98b6384415216 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1050,7 +1050,7 @@ static inline void arch_swap_invalidate_area(int type) static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { if (system_supports_mte() && mte_restore_tags(entry, &folio->page)) - set_bit(PG_mte_tagged, &folio->flags); + set_page_mte_tagged(&folio->page); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6062454a90674..df11cfe61fcb0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2050,8 +2050,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. */ - if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags)) + if (!page_mte_tagged(ZERO_PAGE(0))) { mte_clear_page_tags(lm_alias(empty_zero_page)); + set_page_mte_tagged(ZERO_PAGE(0)); + } kasan_init_hw_tags_cpu(); } diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 27ef7ad3ffd2e..353009d7f307f 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm, * Pages mapped in user space as !pte_access_permitted() (e.g. * PROT_EXEC only) may not have the PG_mte_tagged flag set. */ - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (!page_mte_tagged(page)) { put_page(page); dump_skip(cprm, MTE_PAGE_TAG_STORAGE); continue; diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index af5df48ba915b..788597a6b6a2f 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void) if (!page) continue; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) continue; ret = save_tags(page, pfn); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 7467217c1eaf3..84a085d536f84 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -41,8 +41,10 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (check_swap && is_swap_pte(old_pte)) { swp_entry_t entry = pte_to_swp_entry(old_pte); - if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) + if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) { + set_page_mte_tagged(page); return; + } } if (!pte_is_tagged) @@ -52,8 +54,10 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, * Test PG_mte_tagged again in case it was racing with another * set_pte_at(). */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) { mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } } void mte_sync_tags(pte_t old_pte, pte_t pte) @@ -69,9 +73,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) { mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); + set_page_mte_tagged(page); + } } /* ensure the tags are visible before the PTE is set */ @@ -96,8 +102,7 @@ int memcmp_pages(struct page *page1, struct page *page2) * pages is tagged, set_pte_at() may zero or change the tags of the * other page via mte_sync_tags(). */ - if (test_bit(PG_mte_tagged, &page1->flags) || - test_bit(PG_mte_tagged, &page2->flags)) + if (page_mte_tagged(page1) || page_mte_tagged(page2)) return addr1 != addr2; return ret; @@ -454,7 +459,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags)); + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2ff13a3f84796..817fdd1ab7783 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, maddr = page_address(page); if (!write) { - if (test_bit(PG_mte_tagged, &page->flags)) + if (page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1076,7 +1076,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, * completed fully */ if (num_tags == MTE_GRANULES_PER_PAGE) - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8c..2c3759f1f2c56 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1110,9 +1110,9 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, return -EFAULT; for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (!page_mte_tagged(page)) { mte_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } } diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 24913271e898c..731d8a35701ea 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -21,9 +21,10 @@ void copy_highpage(struct page *to, struct page *from) copy_page(kto, kfrom); - if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { - set_bit(PG_mte_tagged, &to->flags); + if (system_supports_mte() && page_mte_tagged(from)) { + page_kasan_tag_reset(to); mte_copy_page_tags(kto, kfrom); + set_page_mte_tagged(to); } } EXPORT_SYMBOL(copy_highpage); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 5b391490e045b..629e886ceec40 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -934,5 +934,5 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { mte_zero_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index bed803d8e1585..70f913205db99 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -24,7 +24,7 @@ int mte_save_tags(struct page *page) { void *tag_storage, *ret; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) return 0; tag_storage = mte_allocate_tag_storage(); -- GitLab From 2dbf12ae132cc78048615cfa19c9be64baaf0ced Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:36 -0700 Subject: [PATCH 0959/1423] KVM: arm64: Simplify the sanitise_mte_tags() logic Currently sanitise_mte_tags() checks if it's an online page before attempting to sanitise the tags. Such detection should be done in the caller via the VM_MTE_ALLOWED vma flag. Since kvm_set_spte_gfn() does not have the vma, leave the page unmapped if not already tagged. Tag initialisation will be done on a subsequent access fault in user_mem_abort(). Signed-off-by: Catalin Marinas [pcc@google.com: fix the page initializer] Signed-off-by: Peter Collingbourne Reviewed-by: Steven Price Cc: Will Deacon Cc: Marc Zyngier Cc: Peter Collingbourne Reviewed-by: Cornelia Huck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-4-pcc@google.com --- arch/arm64/kvm/mmu.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 2c3759f1f2c56..e81bfb7306298 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1091,23 +1091,14 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * - mmap_lock protects between a VM faulting a page in and the VMM performing * an mprotect() to add VM_MTE */ -static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long size) +static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(pfn); if (!kvm_has_mte(kvm)) - return 0; - - /* - * pfn_to_online_page() is used to reject ZONE_DEVICE pages - * that may not support tags. - */ - page = pfn_to_online_page(pfn); - - if (!page) - return -EFAULT; + return; for (i = 0; i < nr_pages; i++, page++) { if (!page_mte_tagged(page)) { @@ -1115,8 +1106,6 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, set_page_mte_tagged(page); } } - - return 0; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1127,7 +1116,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool write_fault, writable, force_pte = false; bool exec_fault; bool device = false; - bool shared; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1177,8 +1165,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_shift = get_vma_page_shift(vma, hva); } - shared = (vma->vm_flags & VM_SHARED); - switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: @@ -1299,12 +1285,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new VM_SHARED VMA */ - if (!shared) - ret = sanitise_mte_tags(kvm, pfn, vma_pagesize); - else + if ((vma->vm_flags & VM_MTE_ALLOWED) && + !(vma->vm_flags & VM_SHARED)) { + sanitise_mte_tags(kvm, pfn, vma_pagesize); + } else { ret = -EFAULT; - if (ret) goto out_unlock; + } } if (writable) @@ -1526,15 +1513,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_pfn_t pfn = pte_pfn(range->pte); - int ret; if (!kvm->arch.mmu.pgt) return false; WARN_ON(range->end - range->start != 1); - ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE); - if (ret) + /* + * If the page isn't tagged, defer to user_mem_abort() for sanitising + * the MTE tags. The S2 pte should have been unmapped by + * mmu_notifier_invalidate_range_end(). + */ + if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) return false; /* -- GitLab From ef6458b1b6ca3fdb991ce4182e981a88d4c58c0f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:37 -0700 Subject: [PATCH 0960/1423] mm: Add PG_arch_3 page flag As with PG_arch_2, this flag is only allowed on 64-bit architectures due to the shortage of bits available. It will be used by the arm64 MTE code in subsequent patches. Signed-off-by: Peter Collingbourne Cc: Will Deacon Cc: Marc Zyngier Cc: Steven Price [catalin.marinas@arm.com: added flag preserving in __split_huge_page_tail()] Signed-off-by: Catalin Marinas Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-5-pcc@google.com --- fs/proc/page.c | 1 + include/linux/kernel-page-flags.h | 1 + include/linux/page-flags.h | 1 + include/trace/events/mmflags.h | 1 + mm/huge_memory.c | 1 + 5 files changed, 5 insertions(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 882525c8e94c5..6249c347809a9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -221,6 +221,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); #ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index eee1877a354e5..859f4b0c1b2bb 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -18,5 +18,6 @@ #define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 +#define KPF_ARCH_3 42 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5dc7977edf9d5..c50ce2812f177 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -134,6 +134,7 @@ enum pageflags { #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, + PG_arch_3, #endif #ifdef CONFIG_KASAN_HW_TAGS PG_skip_kasan_poison, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index d9f6d35fb1509..412b5a46374c0 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -131,6 +131,7 @@ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_ARCH_X(PG_arch_3, "arch_3" ) \ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") #define show_page_flags(flags) \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76e7b973919ca..dfe72ea23c5f3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2446,6 +2446,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | + (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); -- GitLab From d77e59a8fccde7fb5dd8c57594ed147b4291c970 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:38 -0700 Subject: [PATCH 0961/1423] arm64: mte: Lock a page for MTE tag initialisation Initialising the tags and setting PG_mte_tagged flag for a page can race between multiple set_pte_at() on shared pages or setting the stage 2 pte via user_mem_abort(). Introduce a new PG_mte_lock flag as PG_arch_3 and set it before attempting page initialisation. Given that PG_mte_tagged is never cleared for a page, consider setting this flag to mean page unlocked and wait on this bit with acquire semantics if the page is locked: - try_page_mte_tagging() - lock the page for tagging, return true if it can be tagged, false if already tagged. No acquire semantics if it returns true (PG_mte_tagged not set) as there is no serialisation with a previous set_page_mte_tagged(). - set_page_mte_tagged() - set PG_mte_tagged with release semantics. The two-bit locking is based on Peter Collingbourne's idea. Signed-off-by: Catalin Marinas Signed-off-by: Peter Collingbourne Reviewed-by: Steven Price Cc: Will Deacon Cc: Marc Zyngier Cc: Peter Collingbourne Reviewed-by: Cornelia Huck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-6-pcc@google.com --- arch/arm64/include/asm/mte.h | 35 +++++++++++++++++++++++++++++++- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/mte.c | 12 +++-------- arch/arm64/kvm/guest.c | 16 +++++++++------ arch/arm64/kvm/mmu.c | 2 +- arch/arm64/mm/copypage.c | 2 ++ arch/arm64/mm/fault.c | 2 ++ arch/arm64/mm/mteswap.c | 14 +++++-------- 9 files changed, 60 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3f8199ba265a1..20dd06d70af5f 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); -bool mte_restore_tags(swp_entry_t entry, struct page *page); +void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); @@ -36,6 +36,8 @@ void mte_free_tag_storage(char *storage); /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 +/* simple lock to avoid multiple threads tagging the same page */ +#define PG_mte_lock PG_arch_3 static inline void set_page_mte_tagged(struct page *page) { @@ -60,6 +62,33 @@ static inline bool page_mte_tagged(struct page *page) return ret; } +/* + * Lock the page for tagging and return 'true' if the page can be tagged, + * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the + * locking only happens once for page initialisation. + * + * The page MTE lock state: + * + * Locked: PG_mte_lock && !PG_mte_tagged + * Unlocked: !PG_mte_lock || PG_mte_tagged + * + * Acquire semantics only if the page is tagged (returning 'false'). + */ +static inline bool try_page_mte_tagging(struct page *page) +{ + if (!test_and_set_bit(PG_mte_lock, &page->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. + */ + smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} + void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t old_pte, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); @@ -86,6 +115,10 @@ static inline bool page_mte_tagged(struct page *page) { return false; } +static inline bool try_page_mte_tagging(struct page *page) +{ + return false; +} static inline void mte_zero_clear_page_tags(void *addr) { } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 98b6384415216..8735ac1a1e326 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1049,8 +1049,8 @@ static inline void arch_swap_invalidate_area(int type) #define __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { - if (system_supports_mte() && mte_restore_tags(entry, &folio->page)) - set_page_mte_tagged(&folio->page); + if (system_supports_mte()) + mte_restore_tags(entry, &folio->page); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index df11cfe61fcb0..afb4ffd745c33 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2050,7 +2050,7 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. */ - if (!page_mte_tagged(ZERO_PAGE(0))) { + if (try_page_mte_tagging(ZERO_PAGE(0))) { mte_clear_page_tags(lm_alias(empty_zero_page)); set_page_mte_tagged(ZERO_PAGE(0)); } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 84a085d536f84..f5bcb0dc62673 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -41,20 +41,14 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (check_swap && is_swap_pte(old_pte)) { swp_entry_t entry = pte_to_swp_entry(old_pte); - if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) { - set_page_mte_tagged(page); - return; - } + if (!non_swap_entry(entry)) + mte_restore_tags(entry, page); } if (!pte_is_tagged) return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!page_mte_tagged(page)) { + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 817fdd1ab7783..5626ddb540ce3 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_pfn_clean(pfn); } else { + /* + * Only locking to serialise with a concurrent + * set_pte_at() in the VMM but still overriding the + * tags, hence ignoring the return value. + */ + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); - /* - * Set the flag after checking the write - * completed fully - */ - if (num_tags == MTE_GRANULES_PER_PAGE) - set_page_mte_tagged(page); + /* uaccess failed, don't leave stale tags */ + if (num_tags != MTE_GRANULES_PER_PAGE) + mte_clear_page_tags(page); + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index e81bfb7306298..fa2c85b93149f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1101,7 +1101,7 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, return; for (i = 0; i < nr_pages; i++, page++) { - if (!page_mte_tagged(page)) { + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 731d8a35701ea..8dd5a8fe64b4f 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -23,6 +23,8 @@ void copy_highpage(struct page *to, struct page *from) if (system_supports_mte() && page_mte_tagged(from)) { page_kasan_tag_reset(to); + /* It's a new page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(to)); mte_copy_page_tags(kto, kfrom); set_page_mte_tagged(to); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 629e886ceec40..b8b299d1736af 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -933,6 +933,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { + /* Newly allocated page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(page)); mte_zero_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index 70f913205db99..cd508ba80ab1b 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -46,21 +46,17 @@ int mte_save_tags(struct page *page) return 0; } -bool mte_restore_tags(swp_entry_t entry, struct page *page) +void mte_restore_tags(swp_entry_t entry, struct page *page) { void *tags = xa_load(&mte_pages, entry.val); if (!tags) - return false; + return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (try_page_mte_tagging(page)) { mte_restore_page_tags(page_address(page), tags); - - return true; + set_page_mte_tagged(page); + } } void mte_invalidate_tags(int type, pgoff_t offset) -- GitLab From d89585fbb30869011b326ef26c94c3137d228df9 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:39 -0700 Subject: [PATCH 0962/1423] KVM: arm64: unify the tests for VMAs in memslots when MTE is enabled Previously we allowed creating a memslot containing a private mapping that was not VM_MTE_ALLOWED, but would later reject KVM_RUN with -EFAULT. Now we reject the memory region at memslot creation time. Since this is a minor tweak to the ABI (a VMM that created one of these memslots would fail later anyway), no VMM to my knowledge has MTE support yet, and the hardware with the necessary features is not generally available, we can probably make this ABI change at this point. Signed-off-by: Peter Collingbourne Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Reviewed-by: Cornelia Huck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-7-pcc@google.com --- arch/arm64/kvm/mmu.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index fa2c85b93149f..9ff9a271cf01b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1108,6 +1108,19 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, } } +static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) +{ + /* + * VM_SHARED mappings are not allowed with MTE to avoid races + * when updating the PG_mte_tagged page flag, see + * sanitise_mte_tags for more details. + */ + if (vma->vm_flags & VM_SHARED) + return false; + + return vma->vm_flags & VM_MTE_ALLOWED; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1284,9 +1297,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { - /* Check the VMM hasn't introduced a new VM_SHARED VMA */ - if ((vma->vm_flags & VM_MTE_ALLOWED) && - !(vma->vm_flags & VM_SHARED)) { + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (kvm_vma_mte_allowed(vma)) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; @@ -1730,12 +1742,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vma) break; - /* - * VM_SHARED mappings are not allowed with MTE to avoid races - * when updating the PG_mte_tagged page flag, see - * sanitise_mte_tags for more details. - */ - if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) { + if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { ret = -EINVAL; break; } -- GitLab From c911f0d4687947915f04024aa01803247fcf7f1a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:40 -0700 Subject: [PATCH 0963/1423] KVM: arm64: permit all VM_MTE_ALLOWED mappings with MTE enabled Certain VMMs such as crosvm have features (e.g. sandboxing) that depend on being able to map guest memory as MAP_SHARED. The current restriction on sharing MAP_SHARED pages with the guest is preventing the use of those features with MTE. Now that the races between tasks concurrently clearing tags on the same page have been fixed, remove this restriction. Note that this is a relaxation of the ABI. Signed-off-by: Peter Collingbourne Reviewed-by: Catalin Marinas Reviewed-by: Steven Price Reviewed-by: Cornelia Huck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-8-pcc@google.com --- arch/arm64/kvm/mmu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 9ff9a271cf01b..b9402d8b5a90c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1110,14 +1110,6 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) { - /* - * VM_SHARED mappings are not allowed with MTE to avoid races - * when updating the PG_mte_tagged page flag, see - * sanitise_mte_tags for more details. - */ - if (vma->vm_flags & VM_SHARED) - return false; - return vma->vm_flags & VM_MTE_ALLOWED; } -- GitLab From a4baf8d2639f24d4d31983ff67c01878e7a5393f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:41 -0700 Subject: [PATCH 0964/1423] Documentation: document the ABI changes for KVM_CAP_ARM_MTE Document both the restriction on VM_MTE_ALLOWED mappings and the relaxation for shared mappings. Signed-off-by: Peter Collingbourne Acked-by: Catalin Marinas Reviewed-by: Cornelia Huck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-9-pcc@google.com --- Documentation/virt/kvm/api.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eee9f857a986f..b55f80dadcfeb 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7385,8 +7385,9 @@ hibernation of the host; however the VMM needs to manually save/restore the tags as appropriate if the VM is migrated. When this capability is enabled all memory in memslots must be mapped as -not-shareable (no MAP_SHARED), attempts to create a memslot with a -MAP_SHARED mmap will result in an -EINVAL return. +``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``), +attempts to create a memslot with an invalid mmap will result in an +-EINVAL return. When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to perform a bulk copy of tags to/from the guest. -- GitLab From 3b7c7478eda00945987d45f902bc3942c89243d3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Nov 2022 21:00:54 +0200 Subject: [PATCH 0965/1423] gpiolib: Provide to_gpio_device() helper Provide to_gpio_device() helper which can be utilized in the existing and future code. While at it, make sure it becomes no-op at compilation time. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 2 +- drivers/gpio/gpiolib.h | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 2729f7ebab9d6..0058ee83989df 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -549,7 +549,7 @@ EXPORT_SYMBOL_GPL(gpiochip_line_is_valid); static void gpiodevice_release(struct device *dev) { - struct gpio_device *gdev = container_of(dev, struct gpio_device, dev); + struct gpio_device *gdev = to_gpio_device(dev); unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index d900ecdbac46d..e443c1023a374 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -20,9 +20,9 @@ /** * struct gpio_device - internal state container for GPIO devices - * @id: numerical ID number for the GPIO chip * @dev: the GPIO device struct * @chrdev: character device for the GPIO device + * @id: numerical ID number for the GPIO chip * @mockdev: class device used by the deprecated sysfs interface (may be * NULL) * @owner: helps prevent removal of modules exporting active GPIOs @@ -47,9 +47,9 @@ * userspace. */ struct gpio_device { - int id; struct device dev; struct cdev chrdev; + int id; struct device *mockdev; struct module *owner; struct gpio_chip *chip; @@ -72,6 +72,11 @@ struct gpio_device { #endif }; +static inline struct gpio_device *to_gpio_device(struct device *dev) +{ + return container_of(dev, struct gpio_device, dev); +} + /* gpio suffixes used for ACPI and device tree lookup */ static __maybe_unused const char * const gpio_suffixes[] = { "gpios", "gpio" }; -- GitLab From 9ec1eb1bcceec735fb3c9255cdcdbcc2acf860a0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 18 Nov 2022 21:15:02 +0000 Subject: [PATCH 0966/1423] KVM: selftests: Have perf_test_util signal when to stop vCPUs Signal that a test run is complete through perf_test_args instead of having tests open code a similar solution. Ensure that the field resets to false at the beginning of a test run as the structure is reused between test runs, eliminating a couple of bugs: access_tracking_perf_test hangs indefinitely on a subsequent test run, as 'done' remains true. The bug doesn't amount to much right now, as x86 supports a single guest mode. However, this is a precondition of enabling the test for other architectures with >1 guest mode, like arm64. memslot_modification_stress_test has the exact opposite problem, where subsequent test runs complete immediately as 'run_vcpus' remains false. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [oliver: added commit message, preserve spin_wait_for_next_iteration()] Signed-off-by: Oliver Upton Reviewed-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118211503.4049023-2-oliver.upton@linux.dev --- tools/testing/selftests/kvm/access_tracking_perf_test.c | 8 +------- tools/testing/selftests/kvm/include/perf_test_util.h | 3 +++ tools/testing/selftests/kvm/lib/perf_test_util.c | 3 +++ .../selftests/kvm/memslot_modification_stress_test.c | 6 +----- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 76c583a07ea22..942370d573925 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -58,9 +58,6 @@ static enum { ITERATION_MARK_IDLE, } iteration_work; -/* Set to true when vCPU threads should exit. */ -static bool done; - /* The iteration that was last completed by each vCPU. */ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; @@ -211,7 +208,7 @@ static bool spin_wait_for_next_iteration(int *current_iteration) int last_iteration = *current_iteration; do { - if (READ_ONCE(done)) + if (READ_ONCE(perf_test_args.stop_vcpus)) return false; *current_iteration = READ_ONCE(iteration); @@ -321,9 +318,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) mark_memory_idle(vm, nr_vcpus); access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory"); - /* Set done to signal the vCPU threads to exit */ - done = true; - perf_test_join_vcpu_threads(nr_vcpus); perf_test_destroy_vm(vm); } diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index eaa88df0555a8..536d7c3c3f14e 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -40,6 +40,9 @@ struct perf_test_args { /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ bool nested; + /* Test is done, stop running vCPUs. */ + bool stop_vcpus; + struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS]; }; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 9618b37c66f7e..ee3f499ccbd22 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -267,6 +267,7 @@ void perf_test_start_vcpu_threads(int nr_vcpus, vcpu_thread_fn = vcpu_fn; WRITE_ONCE(all_vcpu_threads_running, false); + WRITE_ONCE(perf_test_args.stop_vcpus, false); for (i = 0; i < nr_vcpus; i++) { struct vcpu_thread *vcpu = &vcpu_threads[i]; @@ -289,6 +290,8 @@ void perf_test_join_vcpu_threads(int nr_vcpus) { int i; + WRITE_ONCE(perf_test_args.stop_vcpus, true); + for (i = 0; i < nr_vcpus; i++) pthread_join(vcpu_threads[i].thread, NULL); } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index bb1d17a1171bc..3a5e4518307c2 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -34,8 +34,6 @@ static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; -static bool run_vcpus = true; - static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; @@ -45,7 +43,7 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) run = vcpu->run; /* Let the guest access its memory until a stop signal is received */ - while (READ_ONCE(run_vcpus)) { + while (!READ_ONCE(perf_test_args.stop_vcpus)) { ret = _vcpu_run(vcpu); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); @@ -110,8 +108,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) add_remove_memslot(vm, p->memslot_modification_delay, p->nr_memslot_modifications); - run_vcpus = false; - perf_test_join_vcpu_threads(nr_vcpus); pr_info("All vCPU threads joined\n"); -- GitLab From 4568180411e0fb5613e217da1c693466e39b9c27 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 18 Nov 2022 21:15:03 +0000 Subject: [PATCH 0967/1423] KVM: selftests: Build access_tracking_perf_test for arm64 Does exactly what it says on the tin. Reviewed-by: Gavin Shan Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118211503.4049023-3-oliver.upton@linux.dev --- tools/testing/selftests/kvm/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0172eb6cb6eee..4c0ff91a89644 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -156,6 +156,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/psci_test TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test -- GitLab From 41555cc9e2e9778ddc7c0293a4a2e4995e332643 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 21 Oct 2022 17:00:30 +0100 Subject: [PATCH 0968/1423] RISC-V: enable sparsemem by default for defconfig on an arch level, RISC-V defaults to FLATMEM. On PolarFire SoC, the memory layout is almost always sparse, with a maximum of 1 GiB at 0x8000_0000 & a possible 16 GiB range at 0x10_0000_0000. The Icicle kit, for example, has 2 GiB of DDR - so there's a big hole in the memory map between the two gigs. Prior to v6.1-rc1, boot times from defconfig builds were pretty bad on Icicle but enabling sparsemem would fix those issues. As of v6.1-rc1, the Icicle kit no longer boots from defconfig builds with the in-kernel devicetree. A change to the memory map resulted in a futher "sparse-ification", producing a splat on boot: OF: fdt: Ignoring memory range 0x80000000 - 0x80200000 Machine model: Microchip PolarFire-SoC Icicle Kit earlycon: ns16550a0 at MMIO32 0x0000000020100000 (options '115200n8') printk: bootconsole [ns16550a0] enabled printk: debug: skip boot console de-registration. efi: UEFI not found. Zone ranges: DMA32 [mem 0x0000000080200000-0x00000000ffffffff] Normal [mem 0x0000000100000000-0x000000107fffffff] Movable zone start for each node Early memory node ranges node 0: [mem 0x0000000080200000-0x00000000bfbfffff] node 0: [mem 0x00000000bfc00000-0x00000000bfffffff] node 0: [mem 0x0000001040000000-0x000000107fffffff] Initmem setup node 0 [mem 0x0000000080200000-0x000000107fffffff] Kernel panic - not syncing: Failed to allocate 1073741824 bytes for node 0 memory map CPU: 0 PID: 0 Comm: swapper Not tainted 5.19.0-dirty #1 Hardware name: Microchip PolarFire-SoC Icicle Kit (DT) Call Trace: [] show_stack+0x30/0x3c [] dump_stack_lvl+0x4a/0x66 [] dump_stack+0x18/0x20 [] panic+0x124/0x2c6 [] free_area_init_core+0x0/0x11e [] free_area_init_node+0xc2/0xf6 [] free_area_init+0x222/0x260 [] misc_mem_init+0x62/0x9a [] setup_arch+0xb0/0xea [] start_kernel+0x88/0x4ee ---[ end Kernel panic - not syncing: Failed to allocate 1073741824 bytes for node 0 memory map ]--- With the aim of keeping defconfig builds booting on icicle, enable SPARSEMEM_MANUAL. Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20221021160028.4042304-1-conor@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 05fd5fcf24f9b..daba5d7438620 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -38,6 +38,7 @@ CONFIG_KVM=m CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +CONFIG_SPARSEMEM_MANUAL=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_NET=y CONFIG_PACKET=y -- GitLab From 4989764d8ed3d3d1024e4e831ff2affc40ee01d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:24 -0400 Subject: [PATCH 0969/1423] iommu: Add IOMMU_CAP_ENFORCE_CACHE_COHERENCY This queries if a domain linked to a device should expect to support enforce_cache_coherency() so iommufd can negotiate the rules for when a domain should be shared or not. For iommufd a device that declares IOMMU_CAP_ENFORCE_CACHE_COHERENCY will not be attached to a domain that does not support it. Link: https://lore.kernel.org/r/1-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/iommu/amd/iommu.c | 2 ++ drivers/iommu/intel/iommu.c | 16 +++++++++++----- include/linux/iommu.h | 5 +++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 45299eb7e8e30..240c535e317cc 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2278,6 +2278,8 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; case IOMMU_CAP_PRE_BOOT_PROTECTION: return amdr_ivrs_remap_support; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return true; default: break; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f298e51d5aa67..157c972741107 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4450,14 +4450,20 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) { - if (cap == IOMMU_CAP_CACHE_COHERENCY) + struct device_domain_info *info = dev_iommu_priv_get(dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: return true; - if (cap == IOMMU_CAP_INTR_REMAP) + case IOMMU_CAP_INTR_REMAP: return irq_remapping_enabled == 1; - if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) + case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); - - return false; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return ecap_sc_support(info->iommu->ecap); + default: + return false; + } } static struct iommu_device *intel_iommu_probe_device(struct device *dev) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 68d7d304cdb76..a09fd32d8cc27 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -124,6 +124,11 @@ enum iommu_cap { IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for DMA protection and we should too */ + /* + * Per-device flag indicating if enforce_cache_coherency() will work on + * this device. + */ + IOMMU_CAP_ENFORCE_CACHE_COHERENCY, }; /* These are the possible reserved region types */ -- GitLab From 89395ccedbc153fecbc29342fbb94a6dfadf24cd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 29 Nov 2022 16:29:25 -0400 Subject: [PATCH 0970/1423] iommu: Add device-centric DMA ownership interfaces These complement the group interfaces used by VFIO and are for use by iommufd. The main difference is that multiple devices in the same group can all share the ownership by passing the same ownership pointer. Move the common code into shared functions. Link: https://lore.kernel.org/r/2-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommu.c | 121 +++++++++++++++++++++++++++++++++--------- include/linux/iommu.h | 12 +++++ 2 files changed, 107 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6ca377f4fbf9e..d69ebba81bebd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3108,41 +3108,49 @@ static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) return 0; } +static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner) +{ + int ret; + + if ((group->domain && group->domain != group->default_domain) || + !xa_empty(&group->pasid_array)) + return -EBUSY; + + ret = __iommu_group_alloc_blocking_domain(group); + if (ret) + return ret; + ret = __iommu_group_set_domain(group, group->blocking_domain); + if (ret) + return ret; + + group->owner = owner; + group->owner_cnt++; + return 0; +} + /** * iommu_group_claim_dma_owner() - Set DMA ownership of a group * @group: The group. * @owner: Caller specified pointer. Used for exclusive ownership. * - * This is to support backward compatibility for vfio which manages - * the dma ownership in iommu_group level. New invocations on this - * interface should be prohibited. + * This is to support backward compatibility for vfio which manages the dma + * ownership in iommu_group level. New invocations on this interface should be + * prohibited. Only a single owner may exist for a group. */ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) { int ret = 0; + if (WARN_ON(!owner)) + return -EINVAL; + mutex_lock(&group->mutex); if (group->owner_cnt) { ret = -EPERM; goto unlock_out; - } else { - if ((group->domain && group->domain != group->default_domain) || - !xa_empty(&group->pasid_array)) { - ret = -EBUSY; - goto unlock_out; - } - - ret = __iommu_group_alloc_blocking_domain(group); - if (ret) - goto unlock_out; - - ret = __iommu_group_set_domain(group, group->blocking_domain); - if (ret) - goto unlock_out; - group->owner = owner; } - group->owner_cnt++; + ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); @@ -3151,30 +3159,91 @@ unlock_out: EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); /** - * iommu_group_release_dma_owner() - Release DMA ownership of a group - * @group: The group. + * iommu_device_claim_dma_owner() - Set DMA ownership of a device + * @dev: The device. + * @owner: Caller specified pointer. Used for exclusive ownership. * - * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + * Claim the DMA ownership of a device. Multiple devices in the same group may + * concurrently claim ownership if they present the same owner value. Returns 0 + * on success and error code on failure */ -void iommu_group_release_dma_owner(struct iommu_group *group) +int iommu_device_claim_dma_owner(struct device *dev, void *owner) { - int ret; + struct iommu_group *group = iommu_group_get(dev); + int ret = 0; + + if (!group) + return -ENODEV; + if (WARN_ON(!owner)) + return -EINVAL; mutex_lock(&group->mutex); + if (group->owner_cnt) { + if (group->owner != owner) { + ret = -EPERM; + goto unlock_out; + } + group->owner_cnt++; + goto unlock_out; + } + + ret = __iommu_take_dma_ownership(group, owner); +unlock_out: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); + +static void __iommu_release_dma_ownership(struct iommu_group *group) +{ + int ret; + if (WARN_ON(!group->owner_cnt || !group->owner || !xa_empty(&group->pasid_array))) - goto unlock_out; + return; group->owner_cnt = 0; group->owner = NULL; ret = __iommu_group_set_domain(group, group->default_domain); WARN(ret, "iommu driver failed to attach the default domain"); +} -unlock_out: +/** + * iommu_group_release_dma_owner() - Release DMA ownership of a group + * @dev: The device + * + * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + */ +void iommu_group_release_dma_owner(struct iommu_group *group) +{ + mutex_lock(&group->mutex); + __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); +/** + * iommu_device_release_dma_owner() - Release DMA ownership of a device + * @group: The device. + * + * Release the DMA ownership claimed by iommu_device_claim_dma_owner(). + */ +void iommu_device_release_dma_owner(struct device *dev) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + if (group->owner_cnt > 1) + group->owner_cnt--; + else + __iommu_release_dma_ownership(group); + mutex_unlock(&group->mutex); + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); + /** * iommu_group_dma_owner_claimed() - Query group dma ownership status * @group: The group. diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a09fd32d8cc27..1690c334e5163 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -707,6 +707,9 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_device_claim_dma_owner(struct device *dev, void *owner); +void iommu_device_release_dma_owner(struct device *dev); + struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, @@ -1064,6 +1067,15 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline void iommu_device_release_dma_owner(struct device *dev) +{ +} + +static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) +{ + return -ENODEV; +} + static inline struct iommu_domain * iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { -- GitLab From 5fe937862c8426f24cd1dcbf7c22fb1a31069b4f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:26 -0400 Subject: [PATCH 0971/1423] interval-tree: Add a utility to iterate over spans in an interval tree The span iterator travels over the indexes of the interval_tree, not the nodes, and classifies spans of indexes as either 'used' or 'hole'. 'used' spans are fully covered by nodes in the tree and 'hole' spans have no node intersecting the span. This is done greedily such that spans are maximally sized and every iteration step switches between used/hole. As an example a trivial allocator can be written as: for (interval_tree_span_iter_first(&span, itree, 0, ULONG_MAX); !interval_tree_span_iter_done(&span); interval_tree_span_iter_next(&span)) if (span.is_hole && span.last_hole - span.start_hole >= allocation_size - 1) return span.start_hole; With all the tricky boundary conditions handled by the library code. The following iommufd patches have several algorithms for its overlapping node interval trees that are significantly simplified with this kind of iteration primitive. As it seems generally useful, put it into lib/. Link: https://lore.kernel.org/r/3-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + include/linux/interval_tree.h | 58 +++++++++++++++ lib/Kconfig | 4 ++ lib/interval_tree.c | 132 ++++++++++++++++++++++++++++++++++ 4 files changed, 195 insertions(+) diff --git a/.clang-format b/.clang-format index 1247d54f9e49f..96d07786dcfb4 100644 --- a/.clang-format +++ b/.clang-format @@ -440,6 +440,7 @@ ForEachMacros: - 'inet_lhash2_for_each_icsk' - 'inet_lhash2_for_each_icsk_continue' - 'inet_lhash2_for_each_icsk_rcu' + - 'interval_tree_for_each_span' - 'intlist__for_each_entry' - 'intlist__for_each_entry_safe' - 'kcore_copy__for_each_phdr' diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 288c26f50732d..2b8026a399064 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -27,4 +27,62 @@ extern struct interval_tree_node * interval_tree_iter_next(struct interval_tree_node *node, unsigned long start, unsigned long last); +/** + * struct interval_tree_span_iter - Find used and unused spans. + * @start_hole: Start of an interval for a hole when is_hole == 1 + * @last_hole: Inclusive end of an interval for a hole when is_hole == 1 + * @start_used: Start of a used interval when is_hole == 0 + * @last_used: Inclusive end of a used interval when is_hole == 0 + * @is_hole: 0 == used, 1 == is_hole, -1 == done iteration + * + * This iterator travels over spans in an interval tree. It does not return + * nodes but classifies each span as either a hole, where no nodes intersect, or + * a used, which is fully covered by nodes. Each iteration step toggles between + * hole and used until the entire range is covered. The returned spans always + * fully cover the requested range. + * + * The iterator is greedy, it always returns the largest hole or used possible, + * consolidating all consecutive nodes. + * + * Use interval_tree_span_iter_done() to detect end of iteration. + */ +struct interval_tree_span_iter { + /* private: not for use by the caller */ + struct interval_tree_node *nodes[2]; + unsigned long first_index; + unsigned long last_index; + + /* public: */ + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + int is_hole; +}; + +void interval_tree_span_iter_first(struct interval_tree_span_iter *state, + struct rb_root_cached *itree, + unsigned long first_index, + unsigned long last_index); +void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, + struct rb_root_cached *itree, + unsigned long new_index); +void interval_tree_span_iter_next(struct interval_tree_span_iter *state); + +static inline bool +interval_tree_span_iter_done(struct interval_tree_span_iter *state) +{ + return state->is_hole == -1; +} + +#define interval_tree_for_each_span(span, itree, first_index, last_index) \ + for (interval_tree_span_iter_first(span, itree, \ + first_index, last_index); \ + !interval_tree_span_iter_done(span); \ + interval_tree_span_iter_next(span)) + #endif /* _LINUX_INTERVAL_TREE_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 9bbf8a4b2108e..c6c323fd25172 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -479,6 +479,10 @@ config INTERVAL_TREE for more information. +config INTERVAL_TREE_SPAN_ITER + bool + depends on INTERVAL_TREE + config XARRAY_MULTI bool help diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 593ce56ece505..3412737ff365e 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -15,3 +15,135 @@ EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); + +#ifdef CONFIG_INTERVAL_TREE_SPAN_ITER +/* + * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous + * span of nodes. This makes nodes[0]->last the end of that contiguous used span + * indexes that started at the original nodes[1]->start. nodes[1] is now the + * first node starting the next used span. A hole span is between nodes[0]->last + * and nodes[1]->start. nodes[1] must be !NULL. + */ +static void +interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state) +{ + struct interval_tree_node *cur = state->nodes[1]; + + state->nodes[0] = cur; + do { + if (cur->last > state->nodes[0]->last) + state->nodes[0] = cur; + cur = interval_tree_iter_next(cur, state->first_index, + state->last_index); + } while (cur && (state->nodes[0]->last >= cur->start || + state->nodes[0]->last + 1 == cur->start)); + state->nodes[1] = cur; +} + +void interval_tree_span_iter_first(struct interval_tree_span_iter *iter, + struct rb_root_cached *itree, + unsigned long first_index, + unsigned long last_index) +{ + iter->first_index = first_index; + iter->last_index = last_index; + iter->nodes[0] = NULL; + iter->nodes[1] = + interval_tree_iter_first(itree, first_index, last_index); + if (!iter->nodes[1]) { + /* No nodes intersect the span, whole span is hole */ + iter->start_hole = first_index; + iter->last_hole = last_index; + iter->is_hole = 1; + return; + } + if (iter->nodes[1]->start > first_index) { + /* Leading hole on first iteration */ + iter->start_hole = first_index; + iter->last_hole = iter->nodes[1]->start - 1; + iter->is_hole = 1; + interval_tree_span_iter_next_gap(iter); + return; + } + + /* Starting inside a used */ + iter->start_used = first_index; + iter->is_hole = 0; + interval_tree_span_iter_next_gap(iter); + iter->last_used = iter->nodes[0]->last; + if (iter->last_used >= last_index) { + iter->last_used = last_index; + iter->nodes[0] = NULL; + iter->nodes[1] = NULL; + } +} +EXPORT_SYMBOL_GPL(interval_tree_span_iter_first); + +void interval_tree_span_iter_next(struct interval_tree_span_iter *iter) +{ + if (!iter->nodes[0] && !iter->nodes[1]) { + iter->is_hole = -1; + return; + } + + if (iter->is_hole) { + iter->start_used = iter->last_hole + 1; + iter->last_used = iter->nodes[0]->last; + if (iter->last_used >= iter->last_index) { + iter->last_used = iter->last_index; + iter->nodes[0] = NULL; + iter->nodes[1] = NULL; + } + iter->is_hole = 0; + return; + } + + if (!iter->nodes[1]) { + /* Trailing hole */ + iter->start_hole = iter->nodes[0]->last + 1; + iter->last_hole = iter->last_index; + iter->nodes[0] = NULL; + iter->is_hole = 1; + return; + } + + /* must have both nodes[0] and [1], interior hole */ + iter->start_hole = iter->nodes[0]->last + 1; + iter->last_hole = iter->nodes[1]->start - 1; + iter->is_hole = 1; + interval_tree_span_iter_next_gap(iter); +} +EXPORT_SYMBOL_GPL(interval_tree_span_iter_next); + +/* + * Advance the iterator index to a specific position. The returned used/hole is + * updated to start at new_index. This is faster than calling + * interval_tree_span_iter_first() as it can avoid full searches in several + * cases where the iterator is already set. + */ +void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, + struct rb_root_cached *itree, + unsigned long new_index) +{ + if (iter->is_hole == -1) + return; + + iter->first_index = new_index; + if (new_index > iter->last_index) { + iter->is_hole = -1; + return; + } + + /* Rely on the union aliasing hole/used */ + if (iter->start_hole <= new_index && new_index <= iter->last_hole) { + iter->start_hole = new_index; + return; + } + if (new_index == iter->last_hole + 1) + interval_tree_span_iter_next(iter); + else + interval_tree_span_iter_first(iter, itree, new_index, + iter->last_index); +} +EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance); +#endif -- GitLab From 632ce1377dbbdabff575d33bec9c79d75ef0395a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:27 -0400 Subject: [PATCH 0972/1423] scripts/kernel-doc: support EXPORT_SYMBOL_NS_GPL() with -export Parse EXPORT_SYMBOL_NS_GPL() in addition to EXPORT_SYMBOL_GPL() for use with the -export flag. Link: https://lore.kernel.org/r/4-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Acked-by: Jonathan Corbet Signed-off-by: Jason Gunthorpe --- scripts/kernel-doc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index aea04365bc69d..48e3feca31701 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -256,6 +256,7 @@ my $doc_inline_sect = '\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)'; my $doc_inline_end = '^\s*\*/\s*$'; my $doc_inline_oneline = '^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$'; my $export_symbol = '^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*;'; +my $export_symbol_ns = '^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*\w+\)\s*;'; my $function_pointer = qr{([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)}; my $attribute = qr{__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)}i; @@ -1948,6 +1949,10 @@ sub process_export_file($) { next if (defined($nosymbol_table{$2})); $function_table{$2} = 1; } + if (/$export_symbol_ns/) { + next if (defined($nosymbol_table{$2})); + $function_table{$2} = 1; + } } close(IN); @@ -2419,12 +2424,12 @@ found on PATH. =item -export Only output documentation for the symbols that have been exported using -EXPORT_SYMBOL() or EXPORT_SYMBOL_GPL() in any input FILE or -export-file FILE. +EXPORT_SYMBOL() and related macros in any input FILE or -export-file FILE. =item -internal Only output documentation for the symbols that have NOT been exported using -EXPORT_SYMBOL() or EXPORT_SYMBOL_GPL() in any input FILE or -export-file FILE. +EXPORT_SYMBOL() and related macros in any input FILE or -export-file FILE. =item -function NAME @@ -2451,8 +2456,7 @@ Do not output DOC: sections. =item -export-file FILE -Specify an additional FILE in which to look for EXPORT_SYMBOL() and -EXPORT_SYMBOL_GPL(). +Specify an additional FILE in which to look for EXPORT_SYMBOL information. To be used with -export or -internal. -- GitLab From 67e6272d53386f9708f91c4d0015c4a1c470eef5 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Mon, 28 Nov 2022 13:52:45 +0200 Subject: [PATCH 0973/1423] RDMA/nldev: Add NULL check to silence false warnings Using nlmsg_put causes static analysis tools to many false positives of not checking the return value of nlmsg_put. In all uses in nldev.c, payload parameter is 0 so NULL will never be returned. So let's add useless checks to silence the warnings. Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/bd924da89d5b4f5291a4a01d9b5ae47c0a9b6a3f.1669636336.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 44 +++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ca0ed7d143261..b4716cda65d89 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1043,6 +1043,10 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_dev_info(msg, device); if (err) @@ -1128,7 +1132,7 @@ static int _nldev_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); - if (fill_dev_info(skb, device)) { + if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1187,6 +1191,10 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) @@ -1248,7 +1256,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); - if (fill_port_info(skb, device, p, sock_net(skb->sk))) { + if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } @@ -1290,6 +1298,10 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_free; + } ret = fill_res_info(msg, device); if (ret) @@ -1321,7 +1333,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); - if (fill_res_info(skb, device)) { + if (!nlh || fill_res_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1456,7 +1468,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); - if (fill_nldev_handle(msg, device)) { + if (!nlh || fill_nldev_handle(msg, device)) { ret = -EMSGSIZE; goto err_free; } @@ -1535,7 +1547,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); - if (fill_nldev_handle(skb, device)) { + if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } @@ -1797,6 +1809,10 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto out_nlmsg; + } data.nl_msg = msg; err = ib_get_client_nl_info(ibdev, client_name, &data); @@ -1854,6 +1870,10 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET), 0, 0); + if (!nlh) { + nlmsg_free(msg); + return -EMSGSIZE; + } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, (u8)ib_devices_shared_netns); @@ -2034,7 +2054,7 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_free_msg; @@ -2103,6 +2123,10 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_fill; + } cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); @@ -2173,7 +2197,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, RDMA_NLDEV_CMD_STAT_GET), 0, 0); - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_msg; @@ -2261,6 +2285,10 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_msg; + } ret = rdma_counter_get_mode(device, port, &mode, &mask); if (ret) @@ -2393,7 +2421,7 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, 0, 0); ret = -EMSGSIZE; - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) goto err_msg; -- GitLab From fc8f93ad3e5485d45c992233c96acd902992dfc4 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 28 Nov 2022 13:52:46 +0200 Subject: [PATCH 0974/1423] RDMA/nldev: Fix failure to send large messages Return "-EMSGSIZE" instead of "-EINVAL" when filling a QP entry, so that new SKBs will be allocated if there's not enough room in current SKB. Fixes: 65959522f806 ("RDMA: Add support to dump resource tracker in RAW format") Signed-off-by: Mark Zhang Reviewed-by: Patrisious Haddad Link: https://lore.kernel.org/r/b5e9c62f6b8369acab5648b661bf539cbceeffdc.1669636336.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index b4716cda65d89..a981ac2f09758 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -513,7 +513,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, /* In create_qp() port is not set yet */ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) - return -EINVAL; + return -EMSGSIZE; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); if (ret) -- GitLab From 5ec3289b31ab9bb209be59cee360aac4b03f320a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 18 Nov 2022 14:32:38 +0000 Subject: [PATCH 0975/1423] KVM: x86/xen: Compatibility fixes for shared runstate area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The guest runstate area can be arbitrarily byte-aligned. In fact, even when a sane 32-bit guest aligns the overall structure nicely, the 64-bit fields in the structure end up being unaligned due to the fact that the 32-bit ABI only aligns them to 32 bits. So setting the ->state_entry_time field to something|XEN_RUNSTATE_UPDATE is buggy, because if it's unaligned then we can't update the whole field atomically; the low bytes might be observable before the _UPDATE bit is. Xen actually updates the *byte* containing that top bit, on its own. KVM should do the same. In addition, we cannot assume that the runstate area fits within a single page. One option might be to make the gfn_to_pfn cache cope with regions that cross a page — but getting a contiguous virtual kernel mapping of a discontiguous set of IOMEM pages is a distinctly non-trivial exercise, and it seems this is the *only* current use case for the GPC which would benefit from it. An earlier version of the runstate code did use a gfn_to_hva cache for this purpose, but it still had the single-page restriction because it used the uhva directly — because it needs to be able to do so atomically when the vCPU is being scheduled out, so it used pagefault_disable() around the accesses and didn't just use kvm_write_guest_cached() which has a fallback path. So... use a pair of GPCs for the first and potential second page covering the runstate area. We can get away with locking both at once because nothing else takes more than one GPC lock at a time so we can invent a trivial ordering rule. The common case where it's all in the same page is kept as a fast path, but in both cases, the actual guest structure (compat or not) is built up from the fields in @vx, following preset pointers to the state and times fields. The only difference is whether those pointers point to the kernel stack (in the split case) or to guest memory directly via the GPC. The fast path is also fixed to use a byte access for the XEN_RUNSTATE_UPDATE bit, then the only real difference is the dual memcpy. Finally, Xen also does write the runstate area immediately when it's configured. Flip the kvm_xen_update_runstate() and …_guest() functions and call the latter directly when the runstate area is set. This means that other ioctls which modify the runstate also write it immediately to the guest when they do so, which is also intended. Update the xen_shinfo_test to exercise the pathological case where the XEN_RUNSTATE_UPDATE flag in the top byte of the state_entry_time is actually in a different page to the rest of the 64-bit word. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 370 +++++++++++++----- arch/x86/kvm/xen.h | 6 +- .../selftests/kvm/x86_64/xen_shinfo_test.c | 12 +- 4 files changed, 276 insertions(+), 113 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d1013c4f673ca..70af7240a1d5a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -686,6 +686,7 @@ struct kvm_vcpu_xen { struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; + struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4b8e9628fbf57..cfc1c07bc78f2 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -170,112 +170,44 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) vcpu->arch.xen.timer.function = xen_timer_callback; } -static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; - u64 now = get_kvmclock_ns(v->kvm); - u64 delta_ns = now - vx->runstate_entry_time; - u64 run_delay = current->sched_info.run_delay; - - if (unlikely(!vx->runstate_entry_time)) - vx->current_runstate = RUNSTATE_offline; - - /* - * Time waiting for the scheduler isn't "stolen" if the - * vCPU wasn't running anyway. - */ - if (vx->current_runstate == RUNSTATE_running) { - u64 steal_ns = run_delay - vx->last_steal; - - delta_ns -= steal_ns; - - vx->runstate_times[RUNSTATE_runnable] += steal_ns; - } - vx->last_steal = run_delay; - - vx->runstate_times[vx->current_runstate] += delta_ns; - vx->current_runstate = state; - vx->runstate_entry_time = now; -} - -void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) -{ - struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; - uint64_t *user_times; + struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache; + struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; + size_t user_len, user_len1, user_len2; + struct vcpu_runstate_info rs; unsigned long flags; - size_t user_len; - int *user_state; - - kvm_xen_update_runstate(v, state); - - if (!vx->runstate_cache.active) - return; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_len = sizeof(struct vcpu_runstate_info); - else - user_len = sizeof(struct compat_vcpu_runstate_info); - - read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - user_len)) { - read_unlock_irqrestore(&gpc->lock, flags); - - /* When invoked from kvm_sched_out() we cannot sleep */ - if (state == RUNSTATE_runnable) - return; - - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) - return; - - read_lock_irqsave(&gpc->lock, flags); - } + size_t times_ofs; + uint8_t *update_bit; + uint64_t *rs_times; + int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the - * runstate struct us the alignment of uint64_t in 32-bit, which + * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of - * padding after the first field 'state'. - * - * So we use 'int __user *user_state' to point to the state field, - * and 'uint64_t __user *user_times' for runstate_entry_time. So - * the actual array of time[] in each state starts at user_times[1]. + * padding after the first field 'state'. Let's be really really + * paranoid about that, and matching it with our internal data + * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 + /* + * The 64-bit structure has 4 bytes of padding before 'state_entry_time' + * so each subsequent field is shifted by 4, and it's 4 bytes longer. + */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); + BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif - - user_state = gpc->khva; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_times = gpc->khva + offsetof(struct vcpu_runstate_info, - state_entry_time); - else - user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, - state_entry_time); - /* - * First write the updated state_entry_time at the appropriate - * location determined by 'offset'. - */ - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - - user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; - smp_wmb(); - - /* - * Next, write the new runstate. This is in the *same* place - * for 32-bit and 64-bit guests, asserted here for paranoia. + * The state field is in the same place at the start of both structs, + * and is the same size (int) as vx->current_runstate. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); @@ -284,34 +216,228 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - *user_state = vx->current_runstate; + /* + * The state_entry_time field is 64 bits in both versions, and the + * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 + * is little-endian means that it's in the last *byte* of the word. + * That detail is important later. + */ + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); /* - * Write the actual runstate times immediately after the - * runstate_entry_time. + * The time array is four 64-bit quantities in both versions, matching + * the vx->runstate_times and immediately following state_entry_time. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != - offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != - offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof_field(struct compat_vcpu_runstate_info, time)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + user_len = sizeof(struct vcpu_runstate_info); + times_ofs = offsetof(struct vcpu_runstate_info, + state_entry_time); + } else { + user_len = sizeof(struct compat_vcpu_runstate_info); + times_ofs = offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + } + + /* + * There are basically no alignment constraints. The guest can set it + * up so it crosses from one page to the next, and at arbitrary byte + * alignment (and the 32-bit ABI doesn't align the 64-bit integers + * anyway, even if the overall struct had been 64-bit aligned). + */ + if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) { + user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK); + user_len2 = user_len - user_len1; + } else { + user_len1 = user_len; + user_len2 = 0; + } + BUG_ON(user_len1 + user_len2 != user_len); + + retry: + /* + * Attempt to obtain the GPC lock on *both* (if there are two) + * gfn_to_pfn caches that cover the region. + */ + read_lock_irqsave(&gpc1->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa, user_len1)) { + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1)) + return; + + read_lock_irqsave(&gpc1->lock, flags); + } + + if (likely(!user_len2)) { + /* + * Set up three pointers directly to the runstate_info + * struct in the guest (via the GPC). + * + * • @rs_state → state field + * • @rs_times → state_entry_time field. + * • @update_bit → last byte of state_entry_time, which + * contains the XEN_RUNSTATE_UPDATE bit. + */ + rs_state = gpc1->khva; + rs_times = gpc1->khva + times_ofs; + update_bit = ((void *)(&rs_times[1])) - 1; + } else { + /* + * The guest's runstate_info is split across two pages and we + * need to hold and validate both GPCs simultaneously. We can + * declare a lock ordering GPC1 > GPC2 because nothing else + * takes them more than one at a time. + */ + read_lock(&gpc2->lock); + + if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { + read_unlock(&gpc2->lock); + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + /* + * Use kvm_gpc_activate() here because if the runstate + * area was configured in 32-bit mode and only extends + * to the second page now because the guest changed to + * 64-bit mode, the second GPC won't have been set up. + */ + if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN, + gpc1->gpa + user_len1, user_len2)) + return; + + /* + * We dropped the lock on GPC1 so we have to go all the + * way back and revalidate that too. + */ + goto retry; + } + + /* + * In this case, the runstate_info struct will be assembled on + * the kernel stack (compat or not as appropriate) and will + * be copied to GPC1/GPC2 with a dual memcpy. Set up the three + * rs pointers accordingly. + */ + rs_times = &rs.state_entry_time; + + /* + * The rs_state pointer points to the start of what we'll + * copy to the guest, which in the case of a compat guest + * is the 32-bit field that the compiler thinks is padding. + */ + rs_state = ((void *)rs_times) - times_ofs; + + /* + * The update_bit is still directly in the guest memory, + * via one GPC or the other. + */ + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + +#ifdef CONFIG_X86_64 + /* + * Don't leak kernel memory through the padding in the 64-bit + * version of the struct. + */ + memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); +#endif + } + + /* + * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the + * state_entry_time field, directly in the guest. We need to set + * that (and write-barrier) before writing to the rest of the + * structure, and clear it last. Just as Xen does, we address the + * single *byte* in which it resides because it might be in a + * different cache line to the rest of the 64-bit word, due to + * the (lack of) alignment constraints. + */ + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; smp_wmb(); /* - * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's - * runstate_entry_time field. + * Now assemble the actual structure, either on our kernel stack + * or directly in the guest according to how the rs_state and + * rs_times pointers were set up above. */ - user_times[0] &= ~XEN_RUNSTATE_UPDATE; + *rs_state = vx->current_runstate; + rs_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; + memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); + + /* For the split case, we have to then copy it to the guest. */ + if (user_len2) { + memcpy(gpc1->khva, rs_state, user_len1); + memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); + } smp_wmb(); - read_unlock_irqrestore(&gpc->lock, flags); + /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ + *update_bit = vx->runstate_entry_time >> 56; + smp_wmb(); - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + if (user_len2) + read_unlock(&gpc2->lock); + + read_unlock_irqrestore(&gpc1->lock, flags); + + mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); + if (user_len2) + mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); +} + +void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + u64 now = get_kvmclock_ns(v->kvm); + u64 delta_ns = now - vx->runstate_entry_time; + u64 run_delay = current->sched_info.run_delay; + + if (unlikely(!vx->runstate_entry_time)) + vx->current_runstate = RUNSTATE_offline; + + /* + * Time waiting for the scheduler isn't "stolen" if the + * vCPU wasn't running anyway. + */ + if (vx->current_runstate == RUNSTATE_running) { + u64 steal_ns = run_delay - vx->last_steal; + + delta_ns -= steal_ns; + + vx->runstate_times[RUNSTATE_runnable] += steal_ns; + } + vx->last_steal = run_delay; + + vx->runstate_times[vx->current_runstate] += delta_ns; + vx->current_runstate = state; + vx->runstate_entry_time = now; + + if (vx->runstate_cache.active) + kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) @@ -584,23 +710,57 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; - case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: { + size_t sz, sz1, sz2; + if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.gpa == GPA_INVALID) { + r = 0; + deactivate_out: kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); - r = 0; + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate2_cache); break; } + /* + * If the guest switches to 64-bit mode after setting the runstate + * address, that's actually OK. kvm_xen_update_runstate_guest() + * will cope. + */ + if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) + sz = sizeof(struct vcpu_runstate_info); + else + sz = sizeof(struct compat_vcpu_runstate_info); + + /* How much fits in the (first) page? */ + sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); - break; + NULL, KVM_HOST_USES_PFN, data->u.gpa, sz1); + if (r) + goto deactivate_out; + + /* Either map the second page, or deactivate the second GPC */ + if (sz1 >= sz) { + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate2_cache); + } else { + sz2 = sz - sz1; + BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); + r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache, + NULL, KVM_HOST_USES_PFN, + data->u.gpa + sz1, sz2); + if (r) + goto deactivate_out; + } + kvm_xen_update_runstate_guest(vcpu, false); + break; + } case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; @@ -1834,6 +1994,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); kvm_gpc_init(&vcpu->arch.xen.runstate_cache); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache); kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); } @@ -1844,6 +2005,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_xen_stop_timer(vcpu); kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache); kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 532a535a9e99f..8503d2c6891e3 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -143,11 +143,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu); #include #include -void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state); +void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state); static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu) { - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running); + kvm_xen_update_runstate(vcpu, RUNSTATE_running); } static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) @@ -162,7 +162,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!vcpu->preempted)) return; - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); + kvm_xen_update_runstate(vcpu, RUNSTATE_runnable); } /* 32-bit compatibility definitions, also used natively in 32-bit build */ diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 2a5727188c8d3..7f39815f17728 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -26,17 +26,17 @@ #define SHINFO_REGION_GPA 0xc0000000ULL #define SHINFO_REGION_SLOT 10 -#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (2 * PAGE_SIZE)) +#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (3 * PAGE_SIZE)) #define DUMMY_REGION_SLOT 11 #define SHINFO_ADDR (SHINFO_REGION_GPA) -#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) -#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + 0x20) #define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) +#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) +#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15) #define SHINFO_VADDR (SHINFO_REGION_GVA) -#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + 0x20) #define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) +#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15) #define EVTCHN_VECTOR 0x10 @@ -449,8 +449,8 @@ int main(int argc, char *argv[]) /* Map a region for the shared_info page */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); - virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2); + SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3); struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR); -- GitLab From d8ba8ba4c801b794f47852a6f1821ea48f83b5d1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 27 Nov 2022 12:22:10 +0000 Subject: [PATCH 0976/1423] KVM: x86/xen: Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured Closer inspection of the Xen code shows that we aren't supposed to be using the XEN_RUNSTATE_UPDATE flag unconditionally. It should be explicitly enabled by guests through the HYPERVISOR_vm_assist hypercall. If we randomly set the top bit of ->state_entry_time for a guest that hasn't asked for it and doesn't expect it, that could make the runtimes fail to add up and confuse the guest. Without the flag it's perfectly safe for a vCPU to read its own vcpu_runstate_info; just not for one vCPU to read *another's*. I briefly pondered adding a word for the whole set of VMASST_TYPE_* flags but the only one we care about for HVM guests is this, so it seemed a bit pointless. Signed-off-by: David Woodhouse Message-Id: <20221127122210.248427-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 34 +++++++++-- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 57 ++++++++++++++----- include/uapi/linux/kvm.h | 4 ++ .../selftests/kvm/x86_64/xen_shinfo_test.c | 14 +++++ 6 files changed, 93 insertions(+), 20 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9175d41e80816..5617bc4f899f4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5339,6 +5339,7 @@ KVM_PV_ASYNC_CLEANUP_PERFORM union { __u8 long_mode; __u8 vector; + __u8 runstate_update_flag; struct { __u64 gfn; } shared_info; @@ -5416,6 +5417,14 @@ KVM_XEN_ATTR_TYPE_XEN_VERSION event channel delivery, so responding within the kernel without exiting to userspace is beneficial. +KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG. It enables the + XEN_RUNSTATE_UPDATE flag which allows guest vCPUs to safely read + other vCPUs' vcpu_runstate_info. Xen guests enable this feature via + the VM_ASST_TYPE_runstate_update_flag of the HYPERVISOR_vm_assist + hypercall. + 4.127 KVM_XEN_HVM_GET_ATTR -------------------------- @@ -8059,12 +8068,13 @@ to userspace. This capability indicates the features that Xen supports for hosting Xen PVHVM guests. Valid flags are:: - #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) - #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) - #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) - #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) - #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) - #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) + #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) + #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) + #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) + #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -8096,6 +8106,18 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes. related to event channel delivery, timers, and the XENVER_version interception. +The KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG flag indicates that KVM supports +the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute in the KVM_XEN_SET_ATTR +and KVM_XEN_GET_ATTR ioctls. This controls whether KVM will set the +XEN_RUNSTATE_UPDATE flag in guest memory mapped vcpu_runstate_info during +updates of the runstate information. Note that versions of KVM which support +the RUNSTATE feature above, but not thie RUNSTATE_UPDATE_FLAG feature, will +always set the XEN_RUNSTATE_UPDATE flag when updating the guest structure, +which is perhaps counterintuitive. When this flag is advertised, KVM will +behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless +specifically enabled (by the guest making the hypercall, causing the VMM +to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute). + 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 70af7240a1d5a..283cbb83d6ae5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1113,6 +1113,7 @@ struct msr_bitmap_range { struct kvm_xen { u32 xen_version; bool long_mode; + bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 72ac6bf05c8b4..59fd55badd732 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4431,7 +4431,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) - r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + r |= KVM_XEN_HVM_CONFIG_RUNSTATE | + KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; break; #endif case KVM_CAP_SYNC_REGS: diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index cfc1c07bc78f2..7acac5dfe2f85 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -179,7 +179,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) struct vcpu_runstate_info rs; unsigned long flags; size_t times_ofs; - uint8_t *update_bit; + uint8_t *update_bit = NULL; + uint64_t entry_time; uint64_t *rs_times; int *rs_state; @@ -297,7 +298,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) */ rs_state = gpc1->khva; rs_times = gpc1->khva + times_ofs; - update_bit = ((void *)(&rs_times[1])) - 1; + if (v->kvm->arch.xen.runstate_update_flag) + update_bit = ((void *)(&rs_times[1])) - 1; } else { /* * The guest's runstate_info is split across two pages and we @@ -351,12 +353,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * The update_bit is still directly in the guest memory, * via one GPC or the other. */ - if (user_len1 >= times_ofs + sizeof(uint64_t)) - update_bit = gpc1->khva + times_ofs + - sizeof(uint64_t) - 1; - else - update_bit = gpc2->khva + times_ofs + - sizeof(uint64_t) - 1 - user_len1; + if (v->kvm->arch.xen.runstate_update_flag) { + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + } #ifdef CONFIG_X86_64 /* @@ -376,8 +380,12 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * different cache line to the rest of the 64-bit word, due to * the (lack of) alignment constraints. */ - *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; - smp_wmb(); + entry_time = vx->runstate_entry_time; + if (update_bit) { + entry_time |= XEN_RUNSTATE_UPDATE; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); + } /* * Now assemble the actual structure, either on our kernel stack @@ -385,7 +393,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * rs_times pointers were set up above. */ *rs_state = vx->current_runstate; - rs_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; + rs_times[0] = entry_time; memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); /* For the split case, we have to then copy it to the guest. */ @@ -396,8 +404,11 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) smp_wmb(); /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ - *update_bit = vx->runstate_entry_time >> 56; - smp_wmb(); + if (update_bit) { + entry_time &= ~XEN_RUNSTATE_UPDATE; + *update_bit = entry_time >> 56; + smp_wmb(); + } if (user_len2) read_unlock(&gpc2->lock); @@ -619,6 +630,17 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + mutex_lock(&kvm->lock); + kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } @@ -656,6 +678,15 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; + r = 0; + break; + default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 88448397642cb..64dfe9c07c879 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1271,6 +1271,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) struct kvm_xen_hvm_config { __u32 flags; @@ -1776,6 +1777,7 @@ struct kvm_xen_hvm_attr { union { __u8 long_mode; __u8 vector; + __u8 runstate_update_flag; struct { __u64 gfn; } shared_info; @@ -1816,6 +1818,8 @@ struct kvm_xen_hvm_attr { /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ #define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 7f39815f17728..c9b0110d73f3b 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -440,6 +440,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); @@ -475,6 +476,19 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + if (do_runstate_flag) { + struct kvm_xen_hvm_attr ruf = { + .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG, + .u.runstate_update_flag = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf); + + ruf.u.runstate_update_flag = 0; + vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf); + TEST_ASSERT(ruf.u.runstate_update_flag == 1, + "Failed to read back RUNSTATE_UPDATE_FLAG attr"); + } + struct kvm_xen_hvm_attr ha = { .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE, -- GitLab From 8acc35186ed63436bfaf60051c8bb53f344dcbfc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 19 Nov 2022 09:27:46 +0000 Subject: [PATCH 0977/1423] KVM: x86/xen: Add runstate tests for 32-bit mode and crossing page boundary Torture test the cases where the runstate crosses a page boundary, and and especially the case where it's configured in 32-bit mode and doesn't, but then switching to 64-bit mode makes it go onto the second page. To simplify this, make the KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST ioctl also update the guest runstate area. It already did so if the actual runstate changed, as a side-effect of kvm_xen_update_runstate(). So doing it in the plain adjustment case is making it more consistent, as well as giving us a nice way to trigger the update without actually running the vCPU again and changing the values. Signed-off-by: David Woodhouse Reviewed-by: Paul Durrant Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 2 + .../selftests/kvm/x86_64/xen_shinfo_test.c | 115 +++++++++++++++--- 2 files changed, 97 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 7acac5dfe2f85..60a9bdd4199fc 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -884,6 +884,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (data->u.runstate.state <= RUNSTATE_offline) kvm_xen_update_runstate(vcpu, data->u.runstate.state); + else if (vcpu->arch.xen.runstate_cache.active) + kvm_xen_update_runstate_guest(vcpu, false); r = 0; break; diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index c9b0110d73f3b..721f6a693799b 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -88,14 +88,20 @@ struct pvclock_wall_clock { } __attribute__((__packed__)); struct vcpu_runstate_info { - uint32_t state; - uint64_t state_entry_time; - uint64_t time[4]; + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; /* Extra field for overrun check */ }; +struct compat_vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; +} __attribute__((__packed__));; + struct arch_vcpu_info { - unsigned long cr2; - unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ }; struct vcpu_info { @@ -1013,22 +1019,91 @@ int main(int argc, char *argv[]) runstate_names[i], rs->time[i]); } } - TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); - TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, - "State entry time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, - "Running time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, - "Runnable time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, - "Blocked time mismatch"); - TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, - "Offline time mismatch"); - - TEST_ASSERT(rs->state_entry_time == rs->time[0] + - rs->time[1] + rs->time[2] + rs->time[3], - "runstate times don't add up"); + + /* + * Exercise runstate info at all points across the page boundary, in + * 32-bit and 64-bit mode. In particular, test the case where it is + * configured in 32-bit mode and then switched to 64-bit mode while + * active, which takes it onto the second page. + */ + unsigned long runstate_addr; + struct compat_vcpu_runstate_info *crs; + for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4; + runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) { + + rs = addr_gpa2hva(vm, runstate_addr); + crs = (void *)rs; + + memset(rs, 0xa5, sizeof(*rs)); + + /* Set to compatibility mode */ + lm.u.long_mode = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + /* Set runstate to new address (kernel will write it) */ + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = runstate_addr, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); + + if (verbose) + printf("Compatibility runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + TEST_ASSERT(crs->state_entry_time == crs->time[0] + + crs->time[1] + crs->time[2] + crs->time[3], + "runstate times don't add up"); + + + /* Now switch to 64-bit mode */ + lm.u.long_mode = 1; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + memset(rs, 0xa5, sizeof(*rs)); + + /* Don't change the address, just trigger a write */ + struct kvm_xen_vcpu_attr adj = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST, + .u.runstate.state = (uint64_t)-1 + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj); + + if (verbose) + printf("64-bit runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + } } + kvm_vm_free(vm); return 0; } -- GitLab From aba3caef58626f09b629085440eec5dd1368669a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:22 +0000 Subject: [PATCH 0978/1423] KVM: Shorten gfn_to_pfn_cache function names Formalize "gpc" as the acronym and use it in function names. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- arch/x86/kvm/xen.c | 30 +++++++++++++++--------------- include/linux/kvm_host.h | 21 ++++++++++----------- virt/kvm/pfncache.c | 20 ++++++++++---------- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59fd55badd732..246bdc9a91547 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3037,12 +3037,12 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, unsigned long flags; read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) { + while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) + if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) return; read_lock_irqsave(&gpc->lock, flags); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 60a9bdd4199fc..9187d024d0066 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -273,14 +273,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * gfn_to_pfn caches that cover the region. */ read_lock_irqsave(&gpc1->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa, user_len1)) { + while (!kvm_gpc_check(v->kvm, gpc1, gpc1->gpa, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) return; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1)) + if (kvm_gpc_refresh(v->kvm, gpc1, gpc1->gpa, user_len1)) return; read_lock_irqsave(&gpc1->lock, flags); @@ -309,7 +309,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) */ read_lock(&gpc2->lock); - if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { + if (!kvm_gpc_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { read_unlock(&gpc2->lock); read_unlock_irqrestore(&gpc1->lock, flags); @@ -489,12 +489,12 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -554,8 +554,8 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -569,8 +569,8 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -1167,7 +1167,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; ret = false; @@ -1564,7 +1564,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1598,7 +1598,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1696,7 +1696,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b8d12356f015f..8f874a9643139 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1288,16 +1288,15 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); * -EFAULT for an untranslatable guest physical address. * * This primes a gfn_to_pfn_cache and links it into the @kvm's list for - * invalidations to be processed. Callers are required to use - * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before - * accessing the target page. + * invalidations to be processed. Callers are required to use kvm_gpc_check() + * to ensure that the cache is valid before accessing the target page. */ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, gpa_t gpa, unsigned long len); /** - * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache. + * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1314,11 +1313,11 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len); +bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len); /** - * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache. + * kvm_gpc_refresh - update a previously initialized cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1335,11 +1334,11 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len); +int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len); /** - * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache. + * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1348,7 +1347,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, * but at least the mapping from GPA to userspace HVA will remain cached * and can be reused on a subsequent refresh. */ -void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 5f83321bfd2a0..8c4db3dcaf6d5 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -76,8 +76,8 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, } } -bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len) +bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); @@ -96,7 +96,7 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, return true; } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check); +EXPORT_SYMBOL_GPL(kvm_gpc_check); static void gpc_unmap_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva) { @@ -238,8 +238,8 @@ out_error: return -EFAULT; } -int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - gpa_t gpa, unsigned long len) +int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); unsigned long page_offset = gpa & ~PAGE_MASK; @@ -333,9 +333,9 @@ out_unlock: return ret; } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_refresh); +EXPORT_SYMBOL_GPL(kvm_gpc_refresh); -void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) { void *old_khva; kvm_pfn_t old_pfn; @@ -360,7 +360,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc_unmap_khva(kvm, old_pfn, old_khva); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap); +EXPORT_SYMBOL_GPL(kvm_gpc_unmap); void kvm_gpc_init(struct gfn_to_pfn_cache *gpc) { @@ -396,7 +396,7 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpc->active = true; write_unlock_irq(&gpc->lock); } - return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len); + return kvm_gpc_refresh(kvm, gpc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_activate); @@ -416,7 +416,7 @@ void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); - kvm_gfn_to_pfn_cache_unmap(kvm, gpc); + kvm_gpc_unmap(kvm, gpc); } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); -- GitLab From c1a81f3bd9b40edc1444dfaeac33f92cff0e770a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:23 +0000 Subject: [PATCH 0979/1423] KVM: x86: Remove unused argument in gpc_unmap_khva() Remove the unused @kvm argument from gpc_unmap_khva(). Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini --- virt/kvm/pfncache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 8c4db3dcaf6d5..b4295474519f8 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -98,7 +98,7 @@ bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, } EXPORT_SYMBOL_GPL(kvm_gpc_check); -static void gpc_unmap_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva) +static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva) { /* Unmap the old pfn/page if it was mapped before. */ if (!is_error_noslot_pfn(pfn) && khva) { @@ -177,7 +177,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) * the existing mapping and didn't create a new one. */ if (new_khva != old_khva) - gpc_unmap_khva(kvm, new_pfn, new_khva); + gpc_unmap_khva(new_pfn, new_khva); kvm_release_pfn_clean(new_pfn); @@ -329,7 +329,7 @@ out_unlock: mutex_unlock(&gpc->refresh_lock); if (unmap_old) - gpc_unmap_khva(kvm, old_pfn, old_khva); + gpc_unmap_khva(old_pfn, old_khva); return ret; } @@ -358,7 +358,7 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) write_unlock_irq(&gpc->lock); mutex_unlock(&gpc->refresh_lock); - gpc_unmap_khva(kvm, old_pfn, old_khva); + gpc_unmap_khva(old_pfn, old_khva); } EXPORT_SYMBOL_GPL(kvm_gpc_unmap); -- GitLab From df0bb47baa95aad133820b149851d5b94cbc6790 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Nov 2022 11:14:35 -0500 Subject: [PATCH 0980/1423] KVM: x86: fix uninitialized variable use on KVM_REQ_TRIPLE_FAULT If a triple fault was fixed by kvm_x86_ops.nested_ops->triple_fault (by turning it into a vmexit), there is no need to leave vcpu_enter_guest(). Any vcpu->requests will be caught later before the actual vmentry, and in fact vcpu_enter_guest() was not initializing the "r" variable. Depending on the compiler's whims, this could cause the x86_64/triple_fault_event_test test to fail. Cc: Maxim Levitsky Fixes: 92e7d5c83aff ("KVM: x86: allow L1 to not intercept triple fault") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 246bdc9a91547..7f850dfb40861 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10280,8 +10280,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; r = 0; + goto out; } - goto out; } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ -- GitLab From 032e160305f6872e590c77f11896fb28365c6d6c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: [PATCH 0981/1423] xfs: invalidate block device page cache during unmount Every now and then I see fstests failures on aarch64 (64k pages) that trigger on the following sequence: mkfs.xfs $dev mount $dev $mnt touch $mnt/a umount $mnt xfs_db -c 'path /a' -c 'print' $dev 99% of the time this succeeds, but every now and then xfs_db cannot find /a and fails. This turns out to be a race involving udev/blkid, the page cache for the block device, and the xfs_db process. udev is triggered whenever anyone closes a block device or unmounts it. The default udev rules invoke blkid to read the fs super and create symlinks to the bdev under /dev/disk. For this, it uses buffered reads through the page cache. xfs_db also uses buffered reads to examine metadata. There is no coordination between xfs_db and udev, which means that they can run concurrently. Note there is no coordination between the kernel and blkid either. On a system with 64k pages, the page cache can cache the superblock and the root inode (and hence the root dir) with the same 64k page. If udev spawns blkid after the mkfs and the system is busy enough that it is still running when xfs_db starts up, they'll both read from the same page in the pagecache. The unmount writes updated inode metadata to disk directly. The XFS buffer cache does not use the bdev pagecache, nor does it invalidate the pagecache on umount. If the above scenario occurs, the pagecache no longer reflects what's on disk, xfs_db reads the stale metadata, and fails to find /a. Most of the time this succeeds because closing a bdev invalidates the page cache, but when processes race, everyone loses. Fix the problem by invalidating the bdev pagecache after flushing the bdev, so that xfs_db will see up to date metadata. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index dde346450952a..54c774af6e1c6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); -- GitLab From fd5beaff250d7e88912a937fad072d9d24f219da Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: [PATCH 0982/1423] xfs: use memcpy, not strncpy, to format the attr prefix during listxattr When -Wstringop-truncation is enabled, the compiler complains about truncation of the null byte at the end of the xattr name prefix. This is intentional, since we're concatenating the two strings together and do _not_ want a null byte in the middle of the name. We've already ensured that the name buffer is long enough to handle prefix and name, and the prefix_len is supposed to be the length of the prefix string without the null byte, so use memcpy here instead. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c325a28b89a8d..10aa1fd39d2b0 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -210,7 +210,7 @@ __xfs_xattr_put_listent( return; } offset = context->buffer + context->count; - strncpy(offset, prefix, prefix_len); + memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; -- GitLab From e5827a007aa4bb737c63121fd2c77e089b18a372 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: [PATCH 0983/1423] xfs: shut up -Wuninitialized in xfsaild_push -Wuninitialized complains about @target in xfsaild_push being uninitialized in the case where the waitqueue is active but there is no last item in the AIL to wait for. I /think/ it should never be the case that the subsequent xfs_trans_ail_cursor_first returns a log item and hence we'll never end up at XFS_LSN_CMP, but let's make this explicit. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_trans_ail.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f51df7d94ef74..7d4109af193e6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -422,7 +422,7 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target; + xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -472,6 +472,8 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); + ASSERT(target != NULLCOMMITLSN); + lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; -- GitLab From 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:43 -0800 Subject: [PATCH 0984/1423] xfs: attach dquots to inode before reading data/cow fork mappings I've been running near-continuous integration testing of online fsck, and I've noticed that once a day, one of the ARM VMs will fail the test with out of order records in the data fork. xfs/804 races fsstress with online scrub (aka scan but do not change anything), so I think this might be a bug in the core xfs code. This also only seems to trigger if one runs the test for more than ~6 minutes via TIME_FACTOR=13 or something. https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf I added a debugging patch to the kernel to check the data fork extents after taking the ILOCK, before dropping ILOCK, and before and after each bmapping operation. So far I've narrowed it down to the delalloc code inserting a record in the wrong place in the iext tree: xfs_bmap_add_extent_hole_delay, near line 2691: case 0: /* * New allocation is not contiguous with another * delayed allocation. * Insert a new entry. */ oldlen = newlen = 0; xfs_iunlock_check_datafork(ip); <-- ok here xfs_iext_insert(ip, icur, new, state); xfs_iunlock_check_datafork(ip); <-- bad here break; } I recorded the state of the data fork mappings and iext cursor state when a corrupt data fork is detected immediately after the xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc: ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork: ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1 ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0 ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0 Here we see that a delalloc extent was inserted into the wrong position in the iext leaf, same as all the other times. The extra trace data I collected are as follows: ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000 ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0 ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0 ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0 ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0 ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00 ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00 ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00 ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00 The first line shows that xfs_bmapi_reserve_delalloc was called with whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6. The second line ("oldgot") shows the contents of @got at the beginning of the call, which are the results of the first iext lookup in xfs_buffered_write_iomap_begin. Line 3 ("crapgot") is the result of duplicating the cursor at the start of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup at @off. Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right before the call to xfs_bmap_add_extent_hole_delay. Totally garbage. Line 5 ("nowgot") is contents of @got after the xfs_bmap_add_extent_hole_delay call. Line 6 is the contents of @icur at the beginning fo the call. Lines 7-9 are the contents of the iext cursors at the point where the block mappings were sampled. I think @oldgot is a HOLESTARTBLOCK extent because the first lookup didn't find anything, so we filled in imap with "fake hole until the end". At the time of the first lookup, I suspect that there's only one 32-block unwritten extent in the mapping (hence oldicurpos==1) but by the time we get to recording crapgot, crapicurpos==2. Dave then added: Ok, that's much simpler to reason about, and implies the smoke is coming from xfs_buffered_write_iomap_begin() or xfs_bmapi_reserve_delalloc(). I suspect the former - it does a lot of stuff with the ILOCK_EXCL held..... .... including calling xfs_qm_dqattach_locked(). xfs_buffered_write_iomap_begin ILOCK_EXCL look up icur xfs_qm_dqattach_locked xfs_qm_dqattach_one xfs_qm_dqget_inode dquot cache miss xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); .... xfs_bmapi_reserve_delalloc(icur) Yup, that's what is letting the magic smoke out - xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all goes downhill from there. Back to Darrick now: So. Fix this by moving the dqattach_locked call up before we take the ILOCK, like all the other callers in that file. Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1005f1e365454..68436370927d5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -978,6 +978,10 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); + error = xfs_qm_dqattach(ip); + if (error) + return error; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1081,10 +1085,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; } - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. -- GitLab From 1eb52a6a71981b80f9acbd915acd6a05a5037196 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: [PATCH 0985/1423] xfs: wait iclog complete before tearing down AIL Fix uaf in xfs_trans_ail_delete during xlog force shutdown. In commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") changed the order of running callbacks and wait for iclog completion to avoid unmount path untimely destroy AIL. But which seems not enough to ensue this, adding mdelay in `xfs_buf_item_unpin` can prove that. The reproduction is as follows. To ensure destroy AIL safely, we should wait all xlog ioend workers done and sync the AIL. ================================================================== BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0 Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43 CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G W 6.1.0-rc1-00002-gc28266863c4a #137 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-log/sda xlog_ioend_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xfs_trans_ail_delete+0x240/0x2a0 xfs_buf_item_done+0x7b/0xa0 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x4c8/0x860 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 9606: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7a/0x90 __kmalloc+0x59/0x140 kmem_alloc+0xb2/0x2f0 xfs_trans_ail_init+0x20/0x320 xfs_log_mount+0x37e/0x690 xfs_mountfs+0xe36/0x1b40 xfs_fs_fill_super+0xc5c/0x1a70 get_tree_bdev+0x3c5/0x6c0 vfs_get_tree+0x85/0x250 path_mount+0xec3/0x1830 do_mount+0xef/0x110 __x64_sys_mount+0x150/0x1f0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 9662: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 __kmem_cache_free+0x99/0x2d0 kvfree+0x3a/0x40 xfs_log_unmount+0x60/0xf0 xfs_unmountfs+0xf3/0x1d0 xfs_fs_put_super+0x78/0x300 generic_shutdown_super+0x151/0x400 kill_block_super+0x9a/0xe0 deactivate_locked_super+0x82/0xe0 deactivate_super+0x91/0xb0 cleanup_mnt+0x32a/0x4a0 task_work_run+0x15f/0x240 exit_to_user_mode_prepare+0x188/0x190 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd The buggy address belongs to the object at ffff888023169400 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 0 bytes inside of 128-byte region [ffff888023169400, ffff888023169480) The buggy address belongs to the physical page: page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888023168f80 pfn:0x23168 head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0 raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0141d9907d31b..fc61cc0240236 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -888,6 +888,23 @@ xlog_force_iclog( return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } +/* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ +static void +xlog_wait_iclog_completion(struct xlog *log) +{ + int i; + struct xlog_in_core *iclog = log->l_iclog; + + for (i = 0; i < log->l_iclog_bufs; i++) { + down(&iclog->ic_sema); + up(&iclog->ic_sema); + iclog = iclog->ic_next; + } +} + /* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions @@ -1113,6 +1130,14 @@ xfs_log_unmount( { xfs_log_clean(mp); + /* + * If shutdown has come from iclog IO context, the log + * cleaning will have been skipped and so we need to wait + * for the iclog to complete shutdown processing before we + * tear anything down. + */ + xlog_wait_iclog_completion(mp->m_log); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -2115,17 +2140,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - /* - * Cycle all the iclogbuf locks to make sure all log IO completion - * is done before we tear down these buffers. - */ - iclog = log->l_iclog; - for (i = 0; i < log->l_iclog_bufs; i++) { - down(&iclog->ic_sema); - up(&iclog->ic_sema); - iclog = iclog->ic_next; - } - /* * Destroy the CIL after waiting for iclog IO completion because an * iclog EIO error will try to shut down the log, which accesses the -- GitLab From 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: [PATCH 0986/1423] xfs: fix super block buf log item UAF during force shutdown xfs log io error will trigger xlog shut down, and end_io worker call xlog_state_shutdown_callbacks to unpin and release the buf log item. The race condition is that when there are some thread doing transaction commit and happened not to be intercepted by xlog_is_shutdown, then, these log item will be insert into CIL, when unpin and release these buf log item, UAF will occur. BTW, add delay before `xlog_cil_commit` can increase recurrence probability. The following call graph actually encountered this bad situation. fsstress io end worker kworker/0:1H-216 xlog_ioend_work ->xlog_force_shutdown ->xlog_state_shutdown_callbacks ->xlog_cil_process_committed ->xlog_cil_committed ->xfs_trans_committed_bulk ->xfs_trans_apply_sb_deltas ->li_ops->iop_unpin(lip, 1); ->xfs_trans_getsb ->_xfs_trans_bjoin ->xfs_buf_item_init ->if (bip) { return 0;} //relog ->xlog_cil_commit ->xlog_cil_insert_items //insert into CIL ->xfs_buf_ioend_fail(bp); ->xfs_buf_ioend ->xfs_buf_item_done ->xfs_buf_item_relse ->xfs_buf_item_free when cil push worker gather percpu cil and insert super block buf log item into ctx->log_items then uaf occurs. ================================================================== BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0 Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105 CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W 6.1.0-rc1-00001-g274115149b42 #136 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-cil/sda xlog_cil_push_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xlog_cil_push_work+0x1c8f/0x22f0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 2145: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_slab_alloc+0x54/0x60 kmem_cache_alloc+0x14a/0x510 xfs_buf_item_init+0x160/0x6d0 _xfs_trans_bjoin+0x7f/0x2e0 xfs_trans_getsb+0xb6/0x3f0 xfs_trans_apply_sb_deltas+0x1f/0x8c0 __xfs_trans_commit+0xa25/0xe10 xfs_symlink+0xe23/0x1660 xfs_vn_symlink+0x157/0x280 vfs_symlink+0x491/0x790 do_symlinkat+0x128/0x220 __x64_sys_symlink+0x7a/0x90 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 216: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 kmem_cache_free+0xb6/0x460 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x3d6/0x840 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88801800f388 which belongs to the cache xfs_buf_item of size 272 The buggy address is located 104 bytes inside of 272-byte region [ffff88801800f388, ffff88801800f498) The buggy address belongs to the physical page: page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88801800f208 pfn:0x1800e head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640 raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 522d450a94b18..df7322ed73fa9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1018,6 +1018,8 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (atomic_read(&bip->bli_refcount)) + return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); -- GitLab From 214b0a88c46d5f32d80abe0d1bc2eea1cbd38f11 Mon Sep 17 00:00:00 2001 From: Metin Kaya Date: Mon, 21 Mar 2022 11:05:32 +0000 Subject: [PATCH 0987/1423] KVM: x86/xen: add support for 32-bit guests in SCHEDOP_poll This patch introduces compat version of struct sched_poll for SCHEDOP_poll sub-operation of sched_op hypercall, reads correct amount of data (16 bytes in 32-bit case, 24 bytes otherwise) by using new compat_sched_poll struct, copies it to sched_poll properly, and lets rest of the code run as is. Signed-off-by: Metin Kaya Signed-off-by: David Woodhouse Reviewed-by: Paul Durrant --- arch/x86/kvm/xen.c | 33 +++++++++++++++++++++++++++++---- arch/x86/kvm/xen.h | 7 +++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9187d024d0066..3e434dc339fb5 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1201,20 +1201,45 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, evtchn_port_t port, *ports; gpa_t gpa; - if (!longmode || !lapic_in_kernel(vcpu) || + if (!lapic_in_kernel(vcpu) || !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, - sizeof(sched_poll))) { + if (!gpa) { *r = -EFAULT; return true; } + if (IS_ENABLED(CONFIG_64BIT) && !longmode) { + struct compat_sched_poll sp32; + + /* Sanity check that the compat struct definition is correct */ + BUILD_BUG_ON(sizeof(sp32) != 16); + + if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) { + *r = -EFAULT; + return true; + } + + /* + * This is a 32-bit pointer to an array of evtchn_port_t which + * are uint32_t, so once it's converted no further compat + * handling is needed. + */ + sched_poll.ports = (void *)(unsigned long)(sp32.ports); + sched_poll.nr_ports = sp32.nr_ports; + sched_poll.timeout = sp32.timeout; + } else { + if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, + sizeof(sched_poll))) { + *r = -EFAULT; + return true; + } + } + if (unlikely(sched_poll.nr_ports > 1)) { /* Xen (unofficially) limits number of pollers to 128 */ if (sched_poll.nr_ports > 128) { diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 8503d2c6891e3..ea33d80a0c51f 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -207,4 +207,11 @@ struct compat_vcpu_runstate_info { uint64_t time[4]; } __attribute__((packed)); +struct compat_sched_poll { + /* This is actually a guest virtual address which points to ports. */ + uint32_t ports; + unsigned int nr_ports; + uint64_t timeout; +}; + #endif /* __ARCH_X86_KVM_XEN_H__ */ -- GitLab From 8c82a0b3ba1a411b84af5d43a4cc5994efa897ec Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:24 +0000 Subject: [PATCH 0988/1423] KVM: Store immutable gfn_to_pfn_cache properties Move the assignment of immutable properties @kvm, @vcpu, and @usage to the initializer. Make _activate() and _deactivate() use stored values. Note, @len is also effectively immutable for most cases, but not in the case of the Xen runstate cache, which may be split across two pages and the length of the first segment will depend on its address. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: handle @len in a separate patch] Signed-off-by: Sean Christopherson [dwmw2: acknowledge that @len can actually change for some use cases] Signed-off-by: David Woodhouse --- arch/x86/kvm/x86.c | 14 ++++----- arch/x86/kvm/xen.c | 65 ++++++++++++++++++--------------------- include/linux/kvm_host.h | 37 +++++++++++----------- include/linux/kvm_types.h | 1 + virt/kvm/pfncache.c | 22 ++++++++----- 5 files changed, 69 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7f850dfb40861..b5e7aea221106 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2317,13 +2317,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - if (system_time & 1) { - kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, + if (system_time & 1) + kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, sizeof(struct pvclock_vcpu_time_info)); - } else { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); - } + else + kvm_gpc_deactivate(&vcpu->arch.pv_time); return; } @@ -3391,7 +3389,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(&vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -11542,7 +11540,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 3e434dc339fb5..55257c2a1610f 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gpc_deactivate(kvm, gpc); + kvm_gpc_deactivate(gpc); goto out; } do { - ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, - PAGE_SIZE); + ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); if (ret) goto out; @@ -323,8 +322,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * to the second page now because the guest changed to * 64-bit mode, the second GPC won't have been set up. */ - if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN, - gpc1->gpa + user_len1, user_len2)) + if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1, + user_len2)) return; /* @@ -711,15 +710,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, NULL, - KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -727,15 +724,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -751,10 +746,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (data->u.gpa == GPA_INVALID) { r = 0; deactivate_out: - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); break; } @@ -770,20 +763,18 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) /* How much fits in the (first) page? */ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); - r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, sz1); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache, + data->u.gpa, sz1); if (r) goto deactivate_out; /* Either map the second page, or deactivate the second GPC */ if (sz1 >= sz) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); } else { sz2 = sz - sz1; BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); - r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache, - NULL, KVM_HOST_USES_PFN, + r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache, data->u.gpa + sz1, sz2); if (r) goto deactivate_out; @@ -2051,10 +2042,14 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - kvm_gpc_init(&vcpu->arch.xen.runstate_cache); - kvm_gpc_init(&vcpu->arch.xen.runstate2_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -2062,10 +2057,10 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); del_timer_sync(&vcpu->arch.xen.poll_timer); } @@ -2073,7 +2068,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -2081,7 +2076,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f874a9643139..73ded328f9dce 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1260,18 +1260,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. - * - * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must - * be zero-allocated (or zeroed by the caller before init). - */ -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); - -/** - * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest - * physical address. - * * @kvm: pointer to kvm instance. - * @gpc: struct gfn_to_pfn_cache object. * @vcpu: vCPU to be used for marking pages dirty and to be woken on * invalidation. * @usage: indicates if the resulting host physical PFN is used while @@ -1280,20 +1269,31 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); * changes!---will also force @vcpu to exit the guest and * refresh the cache); and/or if the PFN used directly * by KVM (and thus needs a kernel virtual mapping). + * + * This sets up a gfn_to_pfn_cache by initializing locks and assigning the + * immutable attributes. Note, the cache must be zero-allocated (or zeroed by + * the caller before init). + */ +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage); + +/** + * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest + * physical address. + * + * @gpc: struct gfn_to_pfn_cache object. * @gpa: guest physical address to map. * @len: sanity check; the range being access must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. - * -EFAULT for an untranslatable guest physical address. + * -EFAULT for an untranslatable guest physical address. * - * This primes a gfn_to_pfn_cache and links it into the @kvm's list for + * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for * invalidations to be processed. Callers are required to use kvm_gpc_check() * to ensure that the cache is valid before accessing the target page. */ -int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len); +int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. @@ -1352,13 +1352,12 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * - * This removes a cache from the @kvm's list to be processed on MMU notifier + * This removes a cache from the VM's list to be processed on MMU notifier * invocation. */ -void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 3ca3db020e0e3..76de36e56cdfe 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -67,6 +67,7 @@ struct gfn_to_pfn_cache { gpa_t gpa; unsigned long uhva; struct kvm_memory_slot *memslot; + struct kvm *kvm; struct kvm_vcpu *vcpu; struct list_head list; rwlock_t lock; diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index b4295474519f8..d8ce30b893d97 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -362,25 +362,29 @@ void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) } EXPORT_SYMBOL_GPL(kvm_gpc_unmap); -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc) +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) { + WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); + WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu); + rwlock_init(&gpc->lock); mutex_init(&gpc->refresh_lock); + + gpc->kvm = kvm; + gpc->vcpu = vcpu; + gpc->usage = usage; } EXPORT_SYMBOL_GPL(kvm_gpc_init); -int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len) +int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { - WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); + struct kvm *kvm = gpc->kvm; if (!gpc->active) { gpc->khva = NULL; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; - gpc->vcpu = vcpu; - gpc->usage = usage; gpc->valid = false; spin_lock(&kvm->gpc_lock); @@ -400,8 +404,10 @@ int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, } EXPORT_SYMBOL_GPL(kvm_gpc_activate); -void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) { + struct kvm *kvm = gpc->kvm; + if (gpc->active) { /* * Deactivate the cache before removing it from the list, KVM -- GitLab From e308c24a358d1e79951b16c387cbc6c6593639a5 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:26 +0000 Subject: [PATCH 0989/1423] KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_check() Make kvm_gpc_check() use kvm instance cached in gfn_to_pfn_cache. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/xen.c | 16 +++++++--------- include/linux/kvm_host.h | 4 +--- virt/kvm/pfncache.c | 5 ++--- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e7aea221106..441f08c3af960 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3035,7 +3035,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, unsigned long flags; read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, + while (!kvm_gpc_check(gpc, gpc->gpa, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 55257c2a1610f..148319e980c41 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -272,7 +272,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * gfn_to_pfn caches that cover the region. */ read_lock_irqsave(&gpc1->lock, flags); - while (!kvm_gpc_check(v->kvm, gpc1, gpc1->gpa, user_len1)) { + while (!kvm_gpc_check(gpc1, gpc1->gpa, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); /* When invoked from kvm_sched_out() we cannot sleep */ @@ -308,7 +308,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) */ read_lock(&gpc2->lock); - if (!kvm_gpc_check(v->kvm, gpc2, gpc2->gpa, user_len2)) { + if (!kvm_gpc_check(gpc2, gpc2->gpa, user_len2)) { read_unlock(&gpc2->lock); read_unlock_irqrestore(&gpc1->lock, flags); @@ -488,8 +488,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, @@ -553,8 +552,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -1158,7 +1156,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; ret = false; @@ -1580,7 +1578,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1614,7 +1612,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 73ded328f9dce..befc8114ed0da 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1298,7 +1298,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * @gpa: current guest physical address to map. * @len: sanity check; the range being access must fit a single page. @@ -1313,8 +1312,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len); +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index d8ce30b893d97..decf4fdde668f 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -76,10 +76,9 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, } } -bool kvm_gpc_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len) +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = kvm_memslots(gpc->kvm); if (!gpc->active) return false; -- GitLab From 2a0b128a906ab28b1ab41ceedcaf462b6f74f1aa Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:27 +0000 Subject: [PATCH 0990/1423] KVM: Clean up hva_to_pfn_retry() Make hva_to_pfn_retry() use kvm instance cached in gfn_to_pfn_cache. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- virt/kvm/pfncache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index decf4fdde668f..9d506de6c150d 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -138,7 +138,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s return kvm->mmu_invalidate_seq != mmu_seq; } -static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ void *old_khva = gpc->khva - offset_in_page(gpc->khva); @@ -158,7 +158,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc->valid = false; do { - mmu_seq = kvm->mmu_invalidate_seq; + mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); write_unlock_irq(&gpc->lock); @@ -216,7 +216,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); - } while (mmu_notifier_retry_cache(kvm, mmu_seq)); + } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); gpc->valid = true; gpc->pfn = new_pfn; @@ -294,7 +294,7 @@ int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, * drop the lock and do the HVA to PFN lookup again. */ if (!gpc->valid || old_uhva != gpc->uhva) { - ret = hva_to_pfn_retry(kvm, gpc); + ret = hva_to_pfn_retry(gpc); } else { /* * If the HVA→PFN mapping was already valid, don't unmap it. -- GitLab From 0318f207d1c2e297d1ec1c6e145bb8bd053236f9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:28 +0000 Subject: [PATCH 0991/1423] KVM: Use gfn_to_pfn_cache's immutable "kvm" in kvm_gpc_refresh() Make kvm_gpc_refresh() use kvm instance cached in gfn_to_pfn_cache. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: leave kvm_gpc_unmap() as-is] Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/xen.c | 10 ++++------ include/linux/kvm_host.h | 10 ++++------ virt/kvm/pfncache.c | 7 +++---- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 441f08c3af960..490df3e997fac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3039,7 +3039,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, + if (kvm_gpc_refresh(gpc, gpc->gpa, offset + sizeof(*guest_hv_clock))) return; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 148319e980c41..f50c88b1eaab3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -279,7 +279,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) if (atomic) return; - if (kvm_gpc_refresh(v->kvm, gpc1, gpc1->gpa, user_len1)) + if (kvm_gpc_refresh(gpc1, gpc1->gpa, user_len1)) return; read_lock_irqsave(&gpc1->lock, flags); @@ -491,8 +491,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) while (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(gpc, gpc->gpa, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -566,8 +565,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gpc_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(gpc, gpc->gpa, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -1710,7 +1708,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gpc_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(gpc, gpc->gpa, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index befc8114ed0da..3ce4650776b87 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1317,23 +1317,21 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * - * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. * @gpa: updated guest physical address to map. * @len: sanity check; the range being access must fit a single page. - * + * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. - * -EFAULT for an untranslatable guest physical address. + * -EFAULT for an untranslatable guest physical address. * * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful - * returm from this function does not mean the page can be immediately + * return from this function does not mean the page can be immediately * accessed because it may have raced with an invalidation. Callers must * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len); +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); /** * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 9d506de6c150d..015c5d16948a1 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -237,10 +237,9 @@ out_error: return -EFAULT; } -int kvm_gpc_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, gpa_t gpa, - unsigned long len) +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = kvm_memslots(gpc->kvm); unsigned long page_offset = gpa & ~PAGE_MASK; bool unmap_old = false; unsigned long old_uhva; @@ -399,7 +398,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) gpc->active = true; write_unlock_irq(&gpc->lock); } - return kvm_gpc_refresh(kvm, gpc, gpa, len); + return kvm_gpc_refresh(gpc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_activate); -- GitLab From 9f87791d686d85614584438d4f249eb32ef7964c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:29 +0000 Subject: [PATCH 0992/1423] KVM: Drop KVM's API to allow temporarily unmapping gfn=>pfn cache Drop kvm_gpc_unmap() as it has no users and unclear requirements. The API was added as part of the original gfn_to_pfn_cache support, but its sole usage[*] was never merged. Fold the guts of kvm_gpc_unmap() into the deactivate path and drop the API. Omit acquiring refresh_lock as as concurrent calls to kvm_gpc_deactivate() are not allowed (this is not enforced, e.g. via lockdep. due to it being called during vCPU destruction). If/when temporary unmapping makes a comeback, the desirable behavior is likely to restrict temporary unmapping to vCPU-exclusive mappings and require the vcpu->mutex be held to serialize unmap. Use of the refresh_lock to protect unmapping was somewhat specuatively added by commit 93984f19e7bc ("KVM: Fully serialize gfn=>pfn cache refresh via mutex") to guard against concurrent unmaps, but the primary use case of the temporary unmap, nested virtualization[*], doesn't actually need or want concurrent unmaps. [*] https://lore.kernel.org/all/20211210163625.2886-7-dwmw2@infradead.org Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- include/linux/kvm_host.h | 12 ----------- virt/kvm/pfncache.c | 44 +++++++++++++++------------------------- 2 files changed, 16 insertions(+), 40 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3ce4650776b87..eac76965cf448 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1333,18 +1333,6 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); */ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); -/** - * kvm_gpc_unmap - temporarily unmap a gfn_to_pfn_cache. - * - * @kvm: pointer to kvm instance. - * @gpc: struct gfn_to_pfn_cache object. - * - * This unmaps the referenced page. The cache is left in the invalid state - * but at least the mapping from GPA to userspace HVA will remain cached - * and can be reused on a subsequent refresh. - */ -void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); - /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 015c5d16948a1..5b25127936915 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -333,33 +333,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(kvm_gpc_refresh); -void kvm_gpc_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) -{ - void *old_khva; - kvm_pfn_t old_pfn; - - mutex_lock(&gpc->refresh_lock); - write_lock_irq(&gpc->lock); - - gpc->valid = false; - - old_khva = gpc->khva - offset_in_page(gpc->khva); - old_pfn = gpc->pfn; - - /* - * We can leave the GPA → uHVA map cache intact but the PFN - * lookup will need to be redone even for the same page. - */ - gpc->khva = NULL; - gpc->pfn = KVM_PFN_ERR_FAULT; - - write_unlock_irq(&gpc->lock); - mutex_unlock(&gpc->refresh_lock); - - gpc_unmap_khva(old_pfn, old_khva); -} -EXPORT_SYMBOL_GPL(kvm_gpc_unmap); - void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) { @@ -405,6 +378,8 @@ EXPORT_SYMBOL_GPL(kvm_gpc_activate); void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) { struct kvm *kvm = gpc->kvm; + kvm_pfn_t old_pfn; + void *old_khva; if (gpc->active) { /* @@ -414,13 +389,26 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) */ write_lock_irq(&gpc->lock); gpc->active = false; + gpc->valid = false; + + /* + * Leave the GPA => uHVA cache intact, it's protected by the + * memslot generation. The PFN lookup needs to be redone every + * time as mmu_notifier protection is lost when the cache is + * removed from the VM's gpc_list. + */ + old_khva = gpc->khva - offset_in_page(gpc->khva); + gpc->khva = NULL; + + old_pfn = gpc->pfn; + gpc->pfn = KVM_PFN_ERR_FAULT; write_unlock_irq(&gpc->lock); spin_lock(&kvm->gpc_lock); list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); - kvm_gpc_unmap(kvm, gpc); + gpc_unmap_khva(old_pfn, old_khva); } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); -- GitLab From 5762cb10235776dd1ed5f5f9d6c1aff2b73bec5c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:30 +0000 Subject: [PATCH 0993/1423] KVM: Do not partially reinitialize gfn=>pfn cache during activation Don't partially reinitialize a gfn=>pfn cache when activating the cache, and instead assert that the cache is not valid during activation. Bug the VM if the assertion fails, as use-after-free and/or data corruption is all but guaranteed if KVM ends up with a valid-but-inactive cache. Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- virt/kvm/pfncache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 5b25127936915..c1a772cedc4bd 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -345,6 +345,8 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, gpc->kvm = kvm; gpc->vcpu = vcpu; gpc->usage = usage; + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->uhva = KVM_HVA_ERR_BAD; } EXPORT_SYMBOL_GPL(kvm_gpc_init); @@ -353,10 +355,8 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) struct kvm *kvm = gpc->kvm; if (!gpc->active) { - gpc->khva = NULL; - gpc->pfn = KVM_PFN_ERR_FAULT; - gpc->uhva = KVM_HVA_ERR_BAD; - gpc->valid = false; + if (KVM_BUG_ON(gpc->valid, kvm)) + return -EIO; spin_lock(&kvm->gpc_lock); list_add(&gpc->list, &kvm->gpc_list); -- GitLab From 58f5ee5fedd981e05cb086cba4e8f923c3727a04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:31 +0000 Subject: [PATCH 0994/1423] KVM: Drop @gpa from exported gfn=>pfn cache check() and refresh() helpers Drop the @gpa param from the exported check()+refresh() helpers and limit changing the cache's GPA to the activate path. All external users just feed in gpc->gpa, i.e. this is a fancy nop. Allowing users to change the GPA at check()+refresh() is dangerous as those helpers explicitly allow concurrent calls, e.g. KVM could get into a livelock scenario. It's also unclear as to what the expected behavior should be if multiple tasks attempt to refresh with different GPAs. Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- arch/x86/kvm/x86.c | 6 ++---- arch/x86/kvm/xen.c | 22 +++++++++++----------- include/linux/kvm_host.h | 8 +++----- virt/kvm/pfncache.c | 17 +++++++++++------ 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 490df3e997fac..006b445996a9a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3035,12 +3035,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, unsigned long flags; read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) { + while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gpc_refresh(gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) + if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) return; read_lock_irqsave(&gpc->lock, flags); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f50c88b1eaab3..5208e05ca9a6c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -272,14 +272,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * gfn_to_pfn caches that cover the region. */ read_lock_irqsave(&gpc1->lock, flags); - while (!kvm_gpc_check(gpc1, gpc1->gpa, user_len1)) { + while (!kvm_gpc_check(gpc1, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) return; - if (kvm_gpc_refresh(gpc1, gpc1->gpa, user_len1)) + if (kvm_gpc_refresh(gpc1, user_len1)) return; read_lock_irqsave(&gpc1->lock, flags); @@ -308,7 +308,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) */ read_lock(&gpc2->lock); - if (!kvm_gpc_check(gpc2, gpc2->gpa, user_len2)) { + if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); read_unlock_irqrestore(&gpc1->lock, flags); @@ -488,10 +488,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gpc_refresh(gpc, gpc->gpa, sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -551,7 +551,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -565,7 +565,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gpc_refresh(gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -1154,7 +1154,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gpc_check(gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; ret = false; @@ -1576,7 +1576,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1610,7 +1610,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1708,7 +1708,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gpc_refresh(gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(gpc, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eac76965cf448..7008846fd3dd2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1299,7 +1299,6 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. - * @gpa: current guest physical address to map. * @len: sanity check; the range being access must fit a single page. * * @return: %true if the cache is still valid and the address matches. @@ -1312,15 +1311,14 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->scru for the memslot checks. */ -bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_refresh - update a previously initialized cache. * * @gpc: struct gfn_to_pfn_cache object. - * @gpa: updated guest physical address to map. * @len: sanity check; the range being access must fit a single page. - + * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. @@ -1331,7 +1329,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ -int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); /** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index c1a772cedc4bd..a805cc1544bf6 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -76,18 +76,17 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, } } -bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) +bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); if (!gpc->active) return false; - if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE) + if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE) return false; - if (gpc->gpa != gpa || gpc->generation != slots->generation || - kvm_is_error_hva(gpc->uhva)) + if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) return false; if (!gpc->valid) @@ -237,7 +236,8 @@ out_error: return -EFAULT; } -int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) +static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); unsigned long page_offset = gpa & ~PAGE_MASK; @@ -331,6 +331,11 @@ out_unlock: return ret; } + +int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) +{ + return __kvm_gpc_refresh(gpc, gpc->gpa, len); +} EXPORT_SYMBOL_GPL(kvm_gpc_refresh); void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, @@ -371,7 +376,7 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) gpc->active = true; write_unlock_irq(&gpc->lock); } - return kvm_gpc_refresh(gpc, gpa, len); + return __kvm_gpc_refresh(gpc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_activate); -- GitLab From 06e155c44aa0e7921aa44d3c67f8ea464b16cb75 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Oct 2022 21:12:32 +0000 Subject: [PATCH 0995/1423] KVM: Skip unnecessary "unmap" if gpc is already valid during refresh When refreshing a gfn=>pfn cache, skip straight to unlocking if the cache already valid instead of stuffing the "old" variables to turn the unmapping outro into a nop. Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse --- virt/kvm/pfncache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index a805cc1544bf6..2d6aba6778307 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -301,9 +301,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, * may have changed. */ gpc->khva = old_khva + page_offset; - old_pfn = KVM_PFN_ERR_FAULT; - old_khva = NULL; ret = 0; + goto out_unlock; } out: -- GitLab From eb3992e833d3a17f9b0a3e0371d0b1d3d566f740 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:31:32 +0000 Subject: [PATCH 0996/1423] KVM: VMX: Resume guest immediately when injecting #GP on ECREATE Resume the guest immediately when injecting a #GP on ECREATE due to an invalid enclave size, i.e. don't attempt ECREATE in the host. The #GP is a terminal fault, e.g. skipping the instruction if ECREATE is successful would result in KVM injecting #GP on the instruction following ECREATE. Fixes: 70210c044b4e ("KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions") Cc: stable@vger.kernel.org Cc: Kai Huang Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20220930233132.1723330-1-seanjc@google.com --- arch/x86/kvm/vmx/sgx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 8f95c7c014335..b12da2a6dec95 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, /* Enforce CPUID restriction on max enclave size. */ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : sgx_12_0->edx; - if (size >= BIT_ULL(max_size_log2)) + if (size >= BIT_ULL(max_size_log2)) { kvm_inject_gp(vcpu, 0); + return 1; + } /* * sgx_virt_ecreate() returns: -- GitLab From 4265df667bbdc71c640e43c905bd9aeeead92365 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 8 Nov 2022 11:50:54 +0800 Subject: [PATCH 0997/1423] KVM: x86: Keep the lock order consistent between SRCU and gpc spinlock Acquire SRCU before taking the gpc spinlock in wait_pending_event() so as to be consistent with all other functions that acquire both locks. It's not illegal to acquire SRCU inside a spinlock, nor is there deadlock potential, but in general it's preferable to order locks from least restrictive to most restrictive, e.g. if wait_pending_event() needed to sleep for whatever reason, it could do so while holding SRCU, but would need to drop the spinlock. Signed-off-by: Peng Hao Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/CAPm50a++Cb=QfnjMZ2EnCj-Sb9Y4UM-=uOEtHAcjnNLCAAf-dQ@mail.gmail.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9187d024d0066..2f21fa5ee7de2 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1165,8 +1165,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, bool ret = true; int idx, i; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); + read_lock_irqsave(&gpc->lock, flags); if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; @@ -1187,8 +1187,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); return ret; } -- GitLab From 17122c06b86c9f77f45b86b8e62c3ed440847a59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:36:32 +0000 Subject: [PATCH 0998/1423] KVM: x86: Fail emulation during EMULTYPE_SKIP on any exception Treat any exception during instruction decode for EMULTYPE_SKIP as a "full" emulation failure, i.e. signal failure instead of queuing the exception. When decoding purely to skip an instruction, KVM and/or the CPU has already done some amount of emulation that cannot be unwound, e.g. on an EPT misconfig VM-Exit KVM has already processeed the emulated MMIO. KVM already does this if a #UD is encountered, but not for other exceptions, e.g. if a #PF is encountered during fetch. In SVM's soft-injection use case, queueing the exception is particularly problematic as queueing exceptions while injecting events can put KVM into an infinite loop due to bailing from VM-Enter to service the newly pending exception. E.g. multiple warnings to detect such behavior fire: ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1017 at arch/x86/kvm/x86.c:9873 kvm_arch_vcpu_ioctl_run+0x1de5/0x20a0 [kvm] Modules linked in: kvm_amd ccp kvm irqbypass CPU: 3 PID: 1017 Comm: svm_nested_soft Not tainted 6.0.0-rc1+ #220 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x1de5/0x20a0 [kvm] Call Trace: kvm_vcpu_ioctl+0x223/0x6d0 [kvm] __x64_sys_ioctl+0x85/0xc0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ---[ end trace 0000000000000000 ]--- ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1017 at arch/x86/kvm/x86.c:9987 kvm_arch_vcpu_ioctl_run+0x12a3/0x20a0 [kvm] Modules linked in: kvm_amd ccp kvm irqbypass CPU: 3 PID: 1017 Comm: svm_nested_soft Tainted: G W 6.0.0-rc1+ #220 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x12a3/0x20a0 [kvm] Call Trace: kvm_vcpu_ioctl+0x223/0x6d0 [kvm] __x64_sys_ioctl+0x85/0xc0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ---[ end trace 0000000000000000 ]--- Fixes: 6ea6e84309ca ("KVM: x86: inject exceptions produced by x86_decode_insn") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220930233632.1725475-1-seanjc@google.com --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7f850dfb40861..ef12747ecb63d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8772,7 +8772,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; - if (ctxt->have_exception) { + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. -- GitLab From 5c30e8101e8d5d020b1d7119117889756a6ed713 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:40:31 +0000 Subject: [PATCH 0999/1423] KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid Skip the WRMSR fastpath in SVM's VM-Exit handler if the next RIP isn't valid, e.g. because KVM is running with nrips=false. SVM must decode and emulate to skip the WRMSR if the CPU doesn't provide the next RIP. Getting the instruction bytes to decode the WRMSR requires reading guest memory, which in turn means dereferencing memslots, and that isn't safe because KVM doesn't hold SRCU when the fastpath runs. Don't bother trying to enable the fastpath for this case, e.g. by doing only the WRMSR and leaving the "skip" until later. NRIPS is supported on all modern CPUs (KVM has considered making it mandatory), and the next RIP will be valid the vast, vast majority of the time. ============================= WARNING: suspicious RCU usage 6.0.0-smp--4e557fcd3d80-skip #13 Tainted: G O ----------------------------- include/linux/kvm_host.h:954 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by stable/206475: #0: ffff9d9dfebcc0f0 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x8b/0x620 [kvm] stack backtrace: CPU: 152 PID: 206475 Comm: stable Tainted: G O 6.0.0-smp--4e557fcd3d80-skip #13 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 Call Trace: dump_stack_lvl+0x69/0xaa dump_stack+0x10/0x12 lockdep_rcu_suspicious+0x11e/0x130 kvm_vcpu_gfn_to_memslot+0x155/0x190 [kvm] kvm_vcpu_gfn_to_hva_prot+0x18/0x80 [kvm] paging64_walk_addr_generic+0x183/0x450 [kvm] paging64_gva_to_gpa+0x63/0xd0 [kvm] kvm_fetch_guest_virt+0x53/0xc0 [kvm] __do_insn_fetch_bytes+0x18b/0x1c0 [kvm] x86_decode_insn+0xf0/0xef0 [kvm] x86_emulate_instruction+0xba/0x790 [kvm] kvm_emulate_instruction+0x17/0x20 [kvm] __svm_skip_emulated_instruction+0x85/0x100 [kvm_amd] svm_skip_emulated_instruction+0x13/0x20 [kvm_amd] handle_fastpath_set_msr_irqoff+0xae/0x180 [kvm] svm_vcpu_run+0x4b8/0x5a0 [kvm_amd] vcpu_enter_guest+0x16ca/0x22f0 [kvm] kvm_arch_vcpu_ioctl_run+0x39d/0x900 [kvm] kvm_vcpu_ioctl+0x538/0x620 [kvm] __se_sys_ioctl+0x77/0xc0 __x64_sys_ioctl+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 404d5d7bff0d ("KVM: X86: Introduce more exit_fastpath_completion enum values") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220930234031.1732249-1-seanjc@google.com --- arch/x86/kvm/svm/svm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 91352d6928452..6ffadbd577447 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3895,8 +3895,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + /* + * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM + * can't read guest memory (dereference memslots) to decode the WRMSR. + */ + if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && + nrips && control->next_rip) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; -- GitLab From a8a12c0069b9e3c909b3c22bf8711d2f18a0af97 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Wed, 28 Sep 2022 17:27:48 +0800 Subject: [PATCH 1000/1423] KVM: SVM: Replace kmap_atomic() with kmap_local_page() The use of kmap_atomic() is being deprecated in favor of kmap_local_page()[1]. The main difference between atomic and local mappings is that local mappings don't disable page faults or preemption. There're 2 reasons we can use kmap_local_page() here: 1. SEV is 64-bit only and kmap_local_page() doesn't disable migration in this case, but here the function clflush_cache_range() uses CLFLUSHOPT instruction to flush, and on x86 CLFLUSHOPT is not CPU-local and flushes the page out of the entire cache hierarchy on all CPUs (APM volume 3, chapter 3, CLFLUSHOPT). So there's no need to disable preemption to ensure CPU-local. 2. clflush_cache_range() doesn't need to disable pagefault and the mapping is still valid even if sleeps. This is also true for sched out/in when preempted. In addition, though kmap_local_page() is a thin wrapper around page_address() on 64-bit, kmap_local_page() should still be used here in preference to page_address() since page_address() isn't suitable to be used in a generic function (like sev_clflush_pages()) where the page passed in is not easy to determine the source of allocation. Keeping the kmap* API in place means it can be used for things other than highmem mappings[2]. Therefore, sev_clflush_pages() is a function that should use kmap_local_page() in place of kmap_atomic(). Convert the calls of kmap_atomic() / kunmap_atomic() to kmap_local_page() / kunmap_local(). [1]: https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com [2]: https://lore.kernel.org/lkml/5d667258-b58b-3d28-3609-e7914c99b31b@intel.com/ Suggested-by: Dave Hansen Suggested-by: Ira Weiny Suggested-by: Fabio M. De Francesco Signed-off-by: Zhao Liu Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220928092748.463631-1-zhao1.liu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 69dbf17f0d6ad..86d6897f48068 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -465,9 +465,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) return; for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); + page_virtual = kmap_local_page(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); + kunmap_local(page_virtual); cond_resched(); } } -- GitLab From 9cc409325ddd776f6fd6293d5ce93ce1248af6e4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:19:56 +0000 Subject: [PATCH 1001/1423] KVM: nVMX: Inject #GP, not #UD, if "generic" VMXON CR0/CR4 check fails Inject #GP for if VMXON is attempting with a CR0/CR4 that fails the generic "is CRx valid" check, but passes the CR4.VMXE check, and do the generic checks _after_ handling the post-VMXON VM-Fail. The CR4.VMXE check, and all other #UD cases, are special pre-conditions that are enforced prior to pivoting on the current VMX mode, i.e. occur before interception if VMXON is attempted in VMX non-root mode. All other CR0/CR4 checks generate #GP and effectively have lower priority than the post-VMXON check. Per the SDM: IF (register operand) or (CR0.PE = 0) or (CR4.VMXE = 0) or ... THEN #UD; ELSIF not in VMX operation THEN IF (CPL > 0) or (in A20M mode) or (the values of CR0 and CR4 are not supported in VMX operation) THEN #GP(0); ELSIF in VMX non-root operation THEN VMexit; ELSIF CPL > 0 THEN #GP(0); ELSE VMfail("VMXON executed in VMX root operation"); FI; which, if re-written without ELSIF, yields: IF (register operand) or (CR0.PE = 0) or (CR4.VMXE = 0) or ... THEN #UD IF in VMX non-root operation THEN VMexit; IF CPL > 0 THEN #GP(0) IF in VMX operation THEN VMfail("VMXON executed in VMX root operation"); IF (in A20M mode) or (the values of CR0 and CR4 are not supported in VMX operation) THEN #GP(0); Note, KVM unconditionally forwards VMXON VM-Exits that occur in L2 to L1, i.e. there is no need to check the vCPU is not in VMX non-root mode. Add a comment to explain why unconditionally forwarding such exits is functionally correct. Reported-by: Eric Li Fixes: c7d855c2aff2 ("KVM: nVMX: Inject #UD if VMXON is attempted with incompatible CR0/CR4") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006001956.329314-1-seanjc@google.com --- arch/x86/kvm/vmx/nested.c | 44 +++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b28be793de298..8927910199681 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5131,24 +5131,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks - * that have higher priority than VM-Exit (see Intel SDM's pseudocode - * for VMXON), as KVM must load valid CR0/CR4 values into hardware while - * running the guest, i.e. KVM needs to check the _guest_ values. + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). * - * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and - * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real - * Mode, but KVM will never take the guest out of those modes. + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || - !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* - * CPL=0 and all other checks that are lower priority than VM-Exit must - * be checked manually. + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); @@ -5158,6 +5169,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); -- GitLab From 4f209989586c79e9bf59ba9381101f5fb449dfbb Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 19 Oct 2022 14:36:19 -0700 Subject: [PATCH 1002/1423] KVM: VMX: Guest usage of IA32_SPEC_CTRL is likely At this point in time, most guests (in the default, out-of-the-box configuration) are likely to use IA32_SPEC_CTRL. Therefore, drop the compiler hint that it is unlikely for KVM to be intercepting WRMSR of IA32_SPEC_CTRL. Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221019213620.1953281-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cea8c07f52298..cb40f724d8cc0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; return flags; -- GitLab From 2e7eab81425ad6c875f2ed47c0ce01e78afc38a5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 19 Oct 2022 14:36:20 -0700 Subject: [PATCH 1003/1423] KVM: VMX: Execute IBPB on emulated VM-exit when guest has IBRS According to Intel's document on Indirect Branch Restricted Speculation, "Enabling IBRS does not prevent software from controlling the predicted targets of indirect branches of unrelated software executed later at the same predictor mode (for example, between two different user applications, or two different virtual machines). Such isolation can be ensured through use of the Indirect Branch Predictor Barrier (IBPB) command." This applies to both basic and enhanced IBRS. Since L1 and L2 VMs share hardware predictor modes (guest-user and guest-kernel), hardware IBRS is not sufficient to virtualize IBRS. (The way that basic IBRS is implemented on pre-eIBRS parts, hardware IBRS is actually sufficient in practice, even though it isn't sufficient architecturally.) For virtual CPUs that support IBRS, add an indirect branch prediction barrier on emulated VM-exit, to ensure that the predicted targets of indirect branches executed in L1 cannot be controlled by software that was executed in L2. Since we typically don't intercept guest writes to IA32_SPEC_CTRL, perform the IBPB at emulated VM-exit regardless of the current IA32_SPEC_CTRL.IBRS value, even though the IBPB could technically be deferred until L1 sets IA32_SPEC_CTRL.IBRS, if IA32_SPEC_CTRL.IBRS is clear at emulated VM-exit. This is CVE-2022-2196. Fixes: 5c911beff20a ("KVM: nVMX: Skip IBPB when switching between vmcs01 and vmcs02") Cc: Sean Christopherson Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221019213620.1953281-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 11 +++++++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8927910199681..61c83424285c8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4798,6 +4798,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); + /* + * If IBRS is advertised to the vCPU, KVM must flush the indirect + * branch predictors when transitioning from L2 to L1, as L1 expects + * hardware (KVM in this case) to provide separate predictor modes. + * Bare metal isolates VMX root (host) from VMX non-root (guest), but + * doesn't isolate different VMCSs, i.e. in this case, doesn't provide + * separate modes for L2 vs L1. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + indirect_branch_prediction_barrier(); + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb40f724d8cc0..3f31c46c306e8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, /* * No indirect branch prediction barrier needed when switching - * the active VMCS within a guest, e.g. on nested VM-Enter. - * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); -- GitLab From 658234de0d2ed3a1b86d793f4772e38a2e039b35 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Tue, 29 Nov 2022 16:29:28 -0400 Subject: [PATCH 1004/1423] iommufd: Document overview of iommufd Add iommufd into the documentation tree, and supply initial documentation. Much of this is linked from code comments by kdoc. Link: https://lore.kernel.org/r/5-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Bagas Sanjaya Reviewed-by: Eric Auger Signed-off-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/iommufd.rst | 223 ++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 Documentation/userspace-api/iommufd.rst diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index c78da9ce0ec44..f16337bdb8520 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -25,6 +25,7 @@ place where this information is gathered. ebpf/index ioctl/index iommu + iommufd media/index netlink/index sysfs-platform_profile diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst new file mode 100644 index 0000000000000..79dd9eb515874 --- /dev/null +++ b/Documentation/userspace-api/iommufd.rst @@ -0,0 +1,223 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +======= +IOMMUFD +======= + +:Author: Jason Gunthorpe +:Author: Kevin Tian + +Overview +======== + +IOMMUFD is the user API to control the IOMMU subsystem as it relates to managing +IO page tables from userspace using file descriptors. It intends to be general +and consumable by any driver that wants to expose DMA to userspace. These +drivers are eventually expected to deprecate any internal IOMMU logic +they may already/historically implement (e.g. vfio_iommu_type1.c). + +At minimum iommufd provides universal support of managing I/O address spaces and +I/O page tables for all IOMMUs, with room in the design to add non-generic +features to cater to specific hardware functionality. + +In this context the capital letter (IOMMUFD) refers to the subsystem while the +small letter (iommufd) refers to the file descriptors created via /dev/iommu for +use by userspace. + +Key Concepts +============ + +User Visible Objects +-------------------- + +Following IOMMUFD objects are exposed to userspace: + +- IOMMUFD_OBJ_IOAS, representing an I/O address space (IOAS), allowing map/unmap + of user space memory into ranges of I/O Virtual Address (IOVA). + + The IOAS is a functional replacement for the VFIO container, and like the VFIO + container it copies an IOVA map to a list of iommu_domains held within it. + +- IOMMUFD_OBJ_DEVICE, representing a device that is bound to iommufd by an + external driver. + +- IOMMUFD_OBJ_HW_PAGETABLE, representing an actual hardware I/O page table + (i.e. a single struct iommu_domain) managed by the iommu driver. + + The IOAS has a list of HW_PAGETABLES that share the same IOVA mapping and + it will synchronize its mapping with each member HW_PAGETABLE. + +All user-visible objects are destroyed via the IOMMU_DESTROY uAPI. + +The diagram below shows relationship between user-visible objects and kernel +datastructures (external to iommufd), with numbers referred to operations +creating the objects and links:: + + _________________________________________________________ + | iommufd | + | [1] | + | _________________ | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | [3] [2] | + | | | ____________ __________ | + | | IOAS |<--| |<------| | | + | | | |HW_PAGETABLE| | DEVICE | | + | | | |____________| |__________| | + | | | | | | + | | | | | | + | | | | | | + | | | | | | + | | | | | | + | |_________________| | | | + | | | | | + |_________|___________________|___________________|_______| + | | | + | _____v______ _______v_____ + | PFN storage | | | | + |------------>|iommu_domain| |struct device| + |____________| |_____________| + +1. IOMMUFD_OBJ_IOAS is created via the IOMMU_IOAS_ALLOC uAPI. An iommufd can + hold multiple IOAS objects. IOAS is the most generic object and does not + expose interfaces that are specific to single IOMMU drivers. All operations + on the IOAS must operate equally on each of the iommu_domains inside of it. + +2. IOMMUFD_OBJ_DEVICE is created when an external driver calls the IOMMUFD kAPI + to bind a device to an iommufd. The driver is expected to implement a set of + ioctls to allow userspace to initiate the binding operation. Successful + completion of this operation establishes the desired DMA ownership over the + device. The driver must also set the driver_managed_dma flag and must not + touch the device until this operation succeeds. + +3. IOMMUFD_OBJ_HW_PAGETABLE is created when an external driver calls the IOMMUFD + kAPI to attach a bound device to an IOAS. Similarly the external driver uAPI + allows userspace to initiate the attaching operation. If a compatible + pagetable already exists then it is reused for the attachment. Otherwise a + new pagetable object and iommu_domain is created. Successful completion of + this operation sets up the linkages among IOAS, device and iommu_domain. Once + this completes the device could do DMA. + + Every iommu_domain inside the IOAS is also represented to userspace as a + HW_PAGETABLE object. + + .. note:: + + Future IOMMUFD updates will provide an API to create and manipulate the + HW_PAGETABLE directly. + +A device can only bind to an iommufd due to DMA ownership claim and attach to at +most one IOAS object (no support of PASID yet). + +Kernel Datastructure +-------------------- + +User visible objects are backed by following datastructures: + +- iommufd_ioas for IOMMUFD_OBJ_IOAS. +- iommufd_device for IOMMUFD_OBJ_DEVICE. +- iommufd_hw_pagetable for IOMMUFD_OBJ_HW_PAGETABLE. + +Several terminologies when looking at these datastructures: + +- Automatic domain - refers to an iommu domain created automatically when + attaching a device to an IOAS object. This is compatible to the semantics of + VFIO type1. + +- Manual domain - refers to an iommu domain designated by the user as the + target pagetable to be attached to by a device. Though currently there are + no uAPIs to directly create such domain, the datastructure and algorithms + are ready for handling that use case. + +- In-kernel user - refers to something like a VFIO mdev that is using the + IOMMUFD access interface to access the IOAS. This starts by creating an + iommufd_access object that is similar to the domain binding a physical device + would do. The access object will then allow converting IOVA ranges into struct + page * lists, or doing direct read/write to an IOVA. + +iommufd_ioas serves as the metadata datastructure to manage how IOVA ranges are +mapped to memory pages, composed of: + +- struct io_pagetable holding the IOVA map +- struct iopt_area's representing populated portions of IOVA +- struct iopt_pages representing the storage of PFNs +- struct iommu_domain representing the IO page table in the IOMMU +- struct iopt_pages_access representing in-kernel users of PFNs +- struct xarray pinned_pfns holding a list of pages pinned by in-kernel users + +Each iopt_pages represents a logical linear array of full PFNs. The PFNs are +ultimately derived from userspace VAs via an mm_struct. Once they have been +pinned the PFNs are stored in IOPTEs of an iommu_domain or inside the pinned_pfns +xarray if they have been pinned through an iommufd_access. + +PFN have to be copied between all combinations of storage locations, depending +on what domains are present and what kinds of in-kernel "software access" users +exist. The mechanism ensures that a page is pinned only once. + +An io_pagetable is composed of iopt_areas pointing at iopt_pages, along with a +list of iommu_domains that mirror the IOVA to PFN map. + +Multiple io_pagetable-s, through their iopt_area-s, can share a single +iopt_pages which avoids multi-pinning and double accounting of page +consumption. + +iommufd_ioas is sharable between subsystems, e.g. VFIO and VDPA, as long as +devices managed by different subsystems are bound to a same iommufd. + +IOMMUFD User API +================ + +.. kernel-doc:: include/uapi/linux/iommufd.h + +IOMMUFD Kernel API +================== + +The IOMMUFD kAPI is device-centric with group-related tricks managed behind the +scene. This allows the external drivers calling such kAPI to implement a simple +device-centric uAPI for connecting its device to an iommufd, instead of +explicitly imposing the group semantics in its uAPI as VFIO does. + +.. kernel-doc:: drivers/iommu/iommufd/device.c + :export: + +.. kernel-doc:: drivers/iommu/iommufd/main.c + :export: + +VFIO and IOMMUFD +---------------- + +Connecting a VFIO device to iommufd can be done in two ways. + +First is a VFIO compatible way by directly implementing the /dev/vfio/vfio +container IOCTLs by mapping them into io_pagetable operations. Doing so allows +the use of iommufd in legacy VFIO applications by symlinking /dev/vfio/vfio to +/dev/iommufd or extending VFIO to SET_CONTAINER using an iommufd instead of a +container fd. + +The second approach directly extends VFIO to support a new set of device-centric +user API based on aforementioned IOMMUFD kernel API. It requires userspace +change but better matches the IOMMUFD API semantics and easier to support new +iommufd features when comparing it to the first approach. + +Currently both approaches are still work-in-progress. + +There are still a few gaps to be resolved to catch up with VFIO type1, as +documented in iommufd_vfio_check_extension(). + +Future TODOs +============ + +Currently IOMMUFD supports only kernel-managed I/O page table, similar to VFIO +type1. New features on the radar include: + + - Binding iommu_domain's to PASID/SSID + - Userspace page tables, for ARM, x86 and S390 + - Kernel bypass'd invalidation of user page tables + - Re-use of the KVM page table in the IOMMU + - Dirty page tracking in the IOMMU + - Runtime Increase/Decrease of IOPTE size + - PRI support with faults resolved in userspace -- GitLab From 2ff4bed7fee72ba1abfcff5f11ae8f8e570353f2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:29 -0400 Subject: [PATCH 1005/1423] iommufd: File descriptor, context, kconfig and makefiles This is the basic infrastructure of a new miscdevice to hold the iommufd IOCTL API. It provides: - A miscdevice to create file descriptors to run the IOCTL interface over - A table based ioctl dispatch and centralized extendable pre-validation step - An xarray mapping userspace ID's to kernel objects. The design has multiple inter-related objects held within in a single IOMMUFD fd - A simple usage count to build a graph of object relations and protect against hostile userspace racing ioctls The only IOCTL provided in this patch is the generic 'destroy any object by handle' operation. Link: https://lore.kernel.org/r/6-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- .../userspace-api/ioctl/ioctl-number.rst | 1 + MAINTAINERS | 12 + drivers/iommu/Kconfig | 1 + drivers/iommu/Makefile | 2 +- drivers/iommu/iommufd/Kconfig | 12 + drivers/iommu/iommufd/Makefile | 5 + drivers/iommu/iommufd/iommufd_private.h | 109 ++++++ drivers/iommu/iommufd/main.c | 344 ++++++++++++++++++ include/linux/iommufd.h | 31 ++ include/uapi/linux/iommufd.h | 55 +++ 10 files changed, 571 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/Kconfig create mode 100644 drivers/iommu/iommufd/Makefile create mode 100644 drivers/iommu/iommufd/iommufd_private.h create mode 100644 drivers/iommu/iommufd/main.c create mode 100644 include/linux/iommufd.h create mode 100644 include/uapi/linux/iommufd.h diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 5f81e2a24a5c0..eb045fc495a4e 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -105,6 +105,7 @@ Code Seq# Include File Comments '8' all SNP8023 advanced NIC card ';' 64-7F linux/vfio.h +';' 80-FF linux/iommufd.h '=' 00-3f uapi/linux/ptp_clock.h '@' 00-0F linux/radeonfb.h conflict! '@' 00-0F drivers/video/aty/aty128fb.c conflict! diff --git a/MAINTAINERS b/MAINTAINERS index 379945f82a643..c0a93779731d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10717,6 +10717,18 @@ F: drivers/iommu/dma-iommu.h F: drivers/iommu/iova.c F: include/linux/iova.h +IOMMUFD +M: Jason Gunthorpe +M: Kevin Tian +L: iommu@lists.linux.dev +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git +F: Documentation/userspace-api/iommufd.rst +F: drivers/iommu/iommufd/ +F: include/linux/iommufd.h +F: include/uapi/linux/iommufd.h +F: tools/testing/selftests/iommu/ + IOMMU SUBSYSTEM M: Joerg Roedel M: Will Deacon diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index dc5f7a156ff5e..319966cde5cf6 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -188,6 +188,7 @@ config MSM_IOMMU source "drivers/iommu/amd/Kconfig" source "drivers/iommu/intel/Kconfig" +source "drivers/iommu/iommufd/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 7fbf6a3376620..f461d06513856 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += amd/ intel/ arm/ +obj-y += amd/ intel/ arm/ iommufd/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig new file mode 100644 index 0000000000000..164812084a675 --- /dev/null +++ b/drivers/iommu/iommufd/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD + tristate "IOMMU Userspace API" + select INTERVAL_TREE + select INTERVAL_TREE_SPAN_ITER + select IOMMU_API + default n + help + Provides /dev/iommu, the user API to control the IOMMU subsystem as + it relates to managing IO page tables that point at user space memory. + + If you don't know what to do here, say N. diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile new file mode 100644 index 0000000000000..a07a8cffe937c --- /dev/null +++ b/drivers/iommu/iommufd/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +iommufd-y := \ + main.o + +obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h new file mode 100644 index 0000000000000..bb720bc113178 --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __IOMMUFD_PRIVATE_H +#define __IOMMUFD_PRIVATE_H + +#include +#include +#include +#include + +struct iommufd_ctx { + struct file *file; + struct xarray objects; +}; + +struct iommufd_ucmd { + struct iommufd_ctx *ictx; + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +/* Copy the response in ucmd->cmd back to userspace. */ +static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, + size_t cmd_len) +{ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, cmd_len))) + return -EFAULT; + return 0; +} + +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, +}; + +/* Base struct for all objects with a userspace ID handle. */ +struct iommufd_object { + struct rw_semaphore destroy_rwsem; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + +static inline bool iommufd_lock_obj(struct iommufd_object *obj) +{ + if (!down_read_trylock(&obj->destroy_rwsem)) + return false; + if (!refcount_inc_not_zero(&obj->users)) { + up_read(&obj->destroy_rwsem); + return false; + } + return true; +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type); +static inline void iommufd_put_object(struct iommufd_object *obj) +{ + refcount_dec(&obj->users); + up_read(&obj->destroy_rwsem); +} + +/** + * iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount + * protection + * @obj - Object to release + * + * Objects have two refcount protections (destroy_rwsem and the refcount_t + * users). Holding either of these will prevent the object from being destroyed. + * + * Depending on the use case, one protection or the other is appropriate. In + * most cases references are being protected by the destroy_rwsem. This allows + * orderly destruction of the object because iommufd_object_destroy_user() will + * wait for it to become unlocked. However, as a rwsem, it cannot be held across + * a system call return. So cases that have longer term needs must switch + * to the weaker users refcount_t. + * + * With users protection iommufd_object_destroy_user() will return false, + * refusing to destroy the object, causing -EBUSY to userspace. + */ +static inline void iommufd_ref_to_users(struct iommufd_object *obj) +{ + up_read(&obj->destroy_rwsem); + /* iommufd_lock_obj() obtains users as well */ +} +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + +#define iommufd_object_alloc(ictx, ptr, type) \ + container_of(_iommufd_object_alloc( \ + ictx, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +#endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c new file mode 100644 index 0000000000000..dfbc68b97506d --- /dev/null +++ b/drivers/iommu/iommufd/main.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + * + * iommufd provides control over the IOMMU HW objects created by IOMMU kernel + * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA + * addresses (IOVA) to CPU addresses. + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iommufd_private.h" + +struct iommufd_object_ops { + void (*destroy)(struct iommufd_object *obj); +}; +static const struct iommufd_object_ops iommufd_object_ops[]; + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + init_rwsem(&obj->destroy_rwsem); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, + xa_limit_32b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + +/* + * Allow concurrent access to the object. + * + * Once another thread can see the object pointer it can prevent object + * destruction. Expect for special kernel-only objects there is no in-kernel way + * to reliably destroy a single object. Thus all APIs that are creating objects + * must use iommufd_object_abort() to handle their errors and only call + * iommufd_object_finalize() once object creation cannot fail. + */ +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + void *old; + + old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); + /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ + WARN_ON(old); +} + +/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) +{ + void *old; + + old = xa_erase(&ictx->objects, obj->id); + WARN_ON(old); + kfree(obj); +} + +/* + * Abort an object that has been fully initialized and needs destroy, but has + * not been finalized. + */ +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + iommufd_object_ops[obj->type].destroy(obj); + iommufd_object_abort(ictx, obj); +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + + xa_lock(&ictx->objects); + obj = xa_load(&ictx->objects, id); + if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || + !iommufd_lock_obj(obj)) + obj = ERR_PTR(-ENOENT); + xa_unlock(&ictx->objects); + return obj; +} + +/* + * The caller holds a users refcount and wants to destroy the object. Returns + * true if the object was destroyed. In all cases the caller no longer has a + * reference on obj. + */ +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + /* + * The purpose of the destroy_rwsem is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must hold the read + * side of this, such as during ioctl execution. + */ + down_write(&obj->destroy_rwsem); + xa_lock(&ictx->objects); + refcount_dec(&obj->users); + if (!refcount_dec_if_one(&obj->users)) { + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + return false; + } + __xa_erase(&ictx->objects, obj->id); + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + return true; +} + +static int iommufd_destroy(struct iommufd_ucmd *ucmd) +{ + struct iommu_destroy *cmd = ucmd->cmd; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY); + if (IS_ERR(obj)) + return PTR_ERR(obj); + iommufd_ref_to_users(obj); + /* See iommufd_ref_to_users() */ + if (!iommufd_object_destroy_user(ucmd->ictx, obj)) + return -EBUSY; + return 0; +} + +static int iommufd_fops_open(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx; + + ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); + if (!ictx) + return -ENOMEM; + + xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + ictx->file = filp; + filp->private_data = ictx; + return 0; +} + +static int iommufd_fops_release(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_object *obj; + + /* + * The objects in the xarray form a graph of "users" counts, and we have + * to destroy them in a depth first manner. Leaf objects will reduce the + * users count of interior objects when they are destroyed. + * + * Repeatedly destroying all the "1 users" leaf objects will progress + * until the entire list is destroyed. If this can't progress then there + * is some bug related to object refcounting. + */ + while (!xa_empty(&ictx->objects)) { + unsigned int destroyed = 0; + unsigned long index; + + xa_for_each(&ictx->objects, index, obj) { + if (!refcount_dec_if_one(&obj->users)) + continue; + destroyed++; + xa_erase(&ictx->objects, index); + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + } + /* Bug related to users refcount */ + if (WARN_ON(!destroyed)) + break; + } + kfree(ictx); + return 0; +} + +union ucmd_buffer { + struct iommu_destroy destroy; +}; + +struct iommufd_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct iommufd_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } +static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { + IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), +}; + +static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + const struct iommufd_ioctl_op *op; + struct iommufd_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + ucmd.ictx = filp->private_data; + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + nr = _IOC_NR(cmd); + if (nr < IOMMUFD_CMD_BASE || + (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) + return -ENOIOCTLCMD; + op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + ret = op->execute(&ucmd); + return ret; +} + +static const struct file_operations iommufd_fops = { + .owner = THIS_MODULE, + .open = iommufd_fops_open, + .release = iommufd_fops_release, + .unlocked_ioctl = iommufd_fops_ioctl, +}; + +/** + * iommufd_ctx_get - Get a context reference + * @ictx: Context to get + * + * The caller must already hold a valid reference to ictx. + */ +void iommufd_ctx_get(struct iommufd_ctx *ictx) +{ + get_file(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); + +/** + * iommufd_ctx_from_file - Acquires a reference to the iommufd context + * @file: File to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file + * remains owned by the caller and the caller must still do fput. On success + * the caller is responsible to call iommufd_ctx_put(). + */ +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + struct iommufd_ctx *ictx; + + if (file->f_op != &iommufd_fops) + return ERR_PTR(-EBADFD); + ictx = file->private_data; + iommufd_ctx_get(ictx); + return ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); + +/** + * iommufd_ctx_put - Put back a reference + * @ictx: Context to put back + */ +void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ + fput(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); + +static const struct iommufd_object_ops iommufd_object_ops[] = { +}; + +static struct miscdevice iommu_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "iommu", + .fops = &iommufd_fops, + .nodename = "iommu", + .mode = 0660, +}; + +static int __init iommufd_init(void) +{ + int ret; + + ret = misc_register(&iommu_misc_dev); + if (ret) + return ret; + return 0; +} + +static void __exit iommufd_exit(void) +{ + misc_deregister(&iommu_misc_dev); +} + +module_init(iommufd_init); +module_exit(iommufd_exit); + +MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h new file mode 100644 index 0000000000000..d1817472c2737 --- /dev/null +++ b/include/linux/iommufd.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __LINUX_IOMMUFD_H +#define __LINUX_IOMMUFD_H + +#include +#include +#include + +struct iommufd_ctx; +struct file; + +void iommufd_ctx_get(struct iommufd_ctx *ictx); + +#if IS_ENABLED(CONFIG_IOMMUFD) +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); +void iommufd_ctx_put(struct iommufd_ctx *ictx); +#else /* !CONFIG_IOMMUFD */ +static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ +} +#endif /* CONFIG_IOMMUFD */ +#endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h new file mode 100644 index 0000000000000..37de92f0534b8 --- /dev/null +++ b/include/uapi/linux/iommufd.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef _UAPI_IOMMUFD_H +#define _UAPI_IOMMUFD_H + +#include +#include + +#define IOMMUFD_TYPE (';') + +/** + * DOC: General ioctl format + * + * The ioctl interface follows a general format to allow for extensibility. Each + * ioctl is passed in a structure pointer as the argument providing the size of + * the structure in the first u32. The kernel checks that any structure space + * beyond what it understands is 0. This allows userspace to use the backward + * compatible portion while consistently using the newer, larger, structures. + * + * ioctls use a standard meaning for common errnos: + * + * - ENOTTY: The IOCTL number itself is not supported at all + * - E2BIG: The IOCTL number is supported, but the provided structure has + * non-zero in a part the kernel does not understand. + * - EOPNOTSUPP: The IOCTL number is supported, and the structure is + * understood, however a known field has a value the kernel does not + * understand or support. + * - EINVAL: Everything about the IOCTL was understood, but a field is not + * correct. + * - ENOENT: An ID or IOVA provided does not exist. + * - ENOMEM: Out of memory. + * - EOVERFLOW: Mathematics overflowed. + * + * As well as additional errnos, within specific ioctls. + */ +enum { + IOMMUFD_CMD_BASE = 0x80, + IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, +}; + +/** + * struct iommu_destroy - ioctl(IOMMU_DESTROY) + * @size: sizeof(struct iommu_destroy) + * @id: iommufd object ID to destroy. Can by any destroyable object type. + * + * Destroy any object held within iommufd. + */ +struct iommu_destroy { + __u32 size; + __u32 id; +}; +#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) + +#endif -- GitLab From ce5a23c835aa0f0a931b5bcde1e7811f951b0146 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:30 -0400 Subject: [PATCH 1006/1423] kernel/user: Allow user_struct::locked_vm to be usable for iommufd Following the pattern of io_uring, perf, skb, and bpf, iommfd will use user->locked_vm for accounting pinned pages. Ensure the value is included in the struct and export free_uid() as iommufd is modular. user->locked_vm is the good accounting to use for ulimit because it is per-user, and the security sandboxing of locked pages is not supposed to be per-process. Other places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or mm->locked_vm for accounting pinned pages, but this is only per-process and inconsistent with the new FOLL_LONGTERM users in the kernel. Concurrent work is underway to try to put this in a cgroup, so everything can be consistent and the kernel can provide a FOLL_LONGTERM limit that actually provides security. Link: https://lore.kernel.org/r/7-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/sched/user.h | 2 +- kernel/user.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index f054d0360a753..4cc52698e214e 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -25,7 +25,7 @@ struct user_struct { #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ - defined(CONFIG_VFIO_PCI_ZDEV_KVM) + defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE diff --git a/kernel/user.c b/kernel/user.c index e2cf8c22b539a..d667debeafd60 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,6 +185,7 @@ void free_uid(struct user_struct *up) if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } +EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { -- GitLab From f394576eb11dbcd3a740fa41e577b97f0720d26e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:31 -0400 Subject: [PATCH 1007/1423] iommufd: PFN handling for iopt_pages The top of the data structure provides an IO Address Space (IOAS) that is similar to a VFIO container. The IOAS allows map/unmap of memory into ranges of IOVA called iopt_areas. Multiple IOMMU domains (IO page tables) and in-kernel accesses (like VFIO mdevs) can be attached to the IOAS to access the PFNs that those IOVA areas cover. The IO Address Space (IOAS) datastructure is composed of: - struct io_pagetable holding the IOVA map - struct iopt_areas representing populated portions of IOVA - struct iopt_pages representing the storage of PFNs - struct iommu_domain representing each IO page table in the system IOMMU - struct iopt_pages_access representing in-kernel accesses of PFNs (ie VFIO mdevs) - struct xarray pinned_pfns holding a list of pages pinned by in-kernel accesses This patch introduces the lowest part of the datastructure - the movement of PFNs in a tiered storage scheme: 1) iopt_pages::pinned_pfns xarray 2) Multiple iommu_domains 3) The origin of the PFNs, i.e. the userspace pointer PFN have to be copied between all combinations of tiers, depending on the configuration. The interface is an iterator called a 'pfn_reader' which determines which tier each PFN is stored and loads it into a list of PFNs held in a struct pfn_batch. Each step of the iterator will fill up the pfn_batch, then the caller can use the pfn_batch to send the PFNs to the required destination. Repeating this loop will read all the PFNs in an IOVA range. The pfn_reader and pfn_batch also keep track of the pinned page accounting. While PFNs are always stored and accessed as full PAGE_SIZE units the iommu_domain tier can store with a sub-page offset/length to support IOMMUs with a smaller IOPTE size than PAGE_SIZE. Link: https://lore.kernel.org/r/8-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + drivers/iommu/iommufd/Makefile | 3 +- drivers/iommu/iommufd/double_span.h | 53 ++ drivers/iommu/iommufd/io_pagetable.h | 109 +++ drivers/iommu/iommufd/iommufd_private.h | 24 + drivers/iommu/iommufd/pages.c | 1066 +++++++++++++++++++++++ include/linux/iommufd.h | 7 + 7 files changed, 1262 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/double_span.h create mode 100644 drivers/iommu/iommufd/io_pagetable.h create mode 100644 drivers/iommu/iommufd/pages.c diff --git a/.clang-format b/.clang-format index 96d07786dcfb4..501241f897766 100644 --- a/.clang-format +++ b/.clang-format @@ -440,6 +440,7 @@ ForEachMacros: - 'inet_lhash2_for_each_icsk' - 'inet_lhash2_for_each_icsk_continue' - 'inet_lhash2_for_each_icsk_rcu' + - 'interval_tree_for_each_double_span' - 'interval_tree_for_each_span' - 'intlist__for_each_entry' - 'intlist__for_each_entry_safe' diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index a07a8cffe937c..05a0e91e30afa 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ - main.o + main.o \ + pages.o obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h new file mode 100644 index 0000000000000..b37aab7488c0f --- /dev/null +++ b/drivers/iommu/iommufd/double_span.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef __IOMMUFD_DOUBLE_SPAN_H +#define __IOMMUFD_DOUBLE_SPAN_H + +#include + +/* + * This is a variation of the general interval_tree_span_iter that computes the + * spans over the union of two different interval trees. Used ranges are broken + * up and reported based on the tree that provides the interval. The first span + * always takes priority. Like interval_tree_span_iter it is greedy and the same + * value of is_used will not repeat on two iteration cycles. + */ +struct interval_tree_double_span_iter { + struct rb_root_cached *itrees[2]; + struct interval_tree_span_iter spans[2]; + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */ + int is_used; +}; + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter); +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index); +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter); + +static inline bool +interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state) +{ + return state->is_used == -1; +} + +#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \ + last_index) \ + for (interval_tree_double_span_iter_first(span, itree1, itree2, \ + first_index, last_index); \ + !interval_tree_double_span_iter_done(span); \ + interval_tree_double_span_iter_next(span)) + +#endif diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h new file mode 100644 index 0000000000000..b74bf01ffc52c --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + */ +#ifndef __IO_PAGETABLE_H +#define __IO_PAGETABLE_H + +#include +#include +#include +#include + +#include "iommufd_private.h" + +struct iommu_domain; + +/* + * Each io_pagetable is composed of intervals of areas which cover regions of + * the iova that are backed by something. iova not covered by areas is not + * populated in the page table. Each area is fully populated with pages. + * + * iovas are in byte units, but must be iopt->iova_alignment aligned. + * + * pages can be NULL, this means some other thread is still working on setting + * up or tearing down the area. When observed under the write side of the + * domain_rwsem a NULL pages must mean the area is still being setup and no + * domains are filled. + * + * storage_domain points at an arbitrary iommu_domain that is holding the PFNs + * for this area. It is locked by the pages->mutex. This simplifies the locking + * as the pages code can rely on the storage_domain without having to get the + * iopt->domains_rwsem. + * + * The io_pagetable::iova_rwsem protects node + * The iopt_pages::mutex protects pages_node + * iopt and immu_prot are immutable + * The pages::mutex protects num_accesses + */ +struct iopt_area { + struct interval_tree_node node; + struct interval_tree_node pages_node; + struct io_pagetable *iopt; + struct iopt_pages *pages; + struct iommu_domain *storage_domain; + /* How many bytes into the first page the area starts */ + unsigned int page_offset; + /* IOMMU_READ, IOMMU_WRITE, etc */ + int iommu_prot; + unsigned int num_accesses; +}; + +static inline unsigned long iopt_area_index(struct iopt_area *area) +{ + return area->pages_node.start; +} + +static inline unsigned long iopt_area_last_index(struct iopt_area *area) +{ + return area->pages_node.last; +} + +static inline unsigned long iopt_area_iova(struct iopt_area *area) +{ + return area->node.start; +} + +static inline unsigned long iopt_area_last_iova(struct iopt_area *area) +{ + return area->node.last; +} + +enum { + IOPT_PAGES_ACCOUNT_NONE = 0, + IOPT_PAGES_ACCOUNT_USER = 1, + IOPT_PAGES_ACCOUNT_MM = 2, +}; + +/* + * This holds a pinned page list for multiple areas of IO address space. The + * pages always originate from a linear chunk of userspace VA. Multiple + * io_pagetable's, through their iopt_area's, can share a single iopt_pages + * which avoids multi-pinning and double accounting of page consumption. + * + * indexes in this structure are measured in PAGE_SIZE units, are 0 based from + * the start of the uptr and extend to npages. pages are pinned dynamically + * according to the intervals in the access_itree and domains_itree, npinned + * records the current number of pages pinned. + */ +struct iopt_pages { + struct kref kref; + struct mutex mutex; + size_t npages; + size_t npinned; + size_t last_npinned; + struct task_struct *source_task; + struct mm_struct *source_mm; + struct user_struct *source_user; + void __user *uptr; + bool writable:1; + u8 account_mode; + + struct xarray pinned_pfns; + /* Of iopt_pages_access::node */ + struct rb_root_cached access_itree; + /* Of iopt_area::pages_node */ + struct rb_root_cached domains_itree; +}; + +#endif diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index bb720bc113178..169a30ff3bf0d 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -14,6 +14,30 @@ struct iommufd_ctx { struct xarray objects; }; +/* + * The IOVA to PFN map. The map automatically copies the PFNs into multiple + * domains and permits sharing of PFNs between io_pagetable instances. This + * supports both a design where IOAS's are 1:1 with a domain (eg because the + * domain is HW customized), or where the IOAS is 1:N with multiple generic + * domains. The io_pagetable holds an interval tree of iopt_areas which point + * to shared iopt_pages which hold the pfns mapped to the page table. + * + * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex + */ +struct io_pagetable { + struct rw_semaphore domains_rwsem; + struct xarray domains; + unsigned int next_domain_id; + + struct rw_semaphore iova_rwsem; + struct rb_root_cached area_itree; + /* IOVA that cannot become reserved, struct iopt_allowed */ + struct rb_root_cached allowed_itree; + /* IOVA that cannot be allocated, struct iopt_reserved */ + struct rb_root_cached reserved_itree; + u8 disable_large_pages; +}; + struct iommufd_ucmd { struct iommufd_ctx *ictx; void __user *ubuffer; diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c new file mode 100644 index 0000000000000..ebca78e743c6f --- /dev/null +++ b/drivers/iommu/iommufd/pages.c @@ -0,0 +1,1066 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The iopt_pages is the center of the storage and motion of PFNs. Each + * iopt_pages represents a logical linear array of full PFNs. The array is 0 + * based and has npages in it. Accessors use 'index' to refer to the entry in + * this logical array, regardless of its storage location. + * + * PFNs are stored in a tiered scheme: + * 1) iopt_pages::pinned_pfns xarray + * 2) An iommu_domain + * 3) The origin of the PFNs, i.e. the userspace pointer + * + * PFN have to be copied between all combinations of tiers, depending on the + * configuration. + * + * When a PFN is taken out of the userspace pointer it is pinned exactly once. + * The storage locations of the PFN's index are tracked in the two interval + * trees. If no interval includes the index then it is not pinned. + * + * If access_itree includes the PFN's index then an in-kernel access has + * requested the page. The PFN is stored in the xarray so other requestors can + * continue to find it. + * + * If the domains_itree includes the PFN's index then an iommu_domain is storing + * the PFN and it can be read back using iommu_iova_to_phys(). To avoid + * duplicating storage the xarray is not used if only iommu_domains are using + * the PFN's index. + * + * As a general principle this is designed so that destroy never fails. This + * means removing an iommu_domain or releasing a in-kernel access will not fail + * due to insufficient memory. In practice this means some cases have to hold + * PFNs in the xarray even though they are also being stored in an iommu_domain. + * + * While the iopt_pages can use an iommu_domain as storage, it does not have an + * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the + * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages + * and reference their own slice of the PFN array, with sub page granularity. + * + * In this file the term 'last' indicates an inclusive and closed interval, eg + * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to + * no PFNs. + * + * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so + * last_iova + 1 can overflow. An iopt_pages index will always be much less than + * ULONG_MAX so last_index + 1 cannot overflow. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "io_pagetable.h" +#include "double_span.h" + +#define TEMP_MEMORY_LIMIT 65536 +#define BATCH_BACKUP_SIZE 32 + +/* + * More memory makes pin_user_pages() and the batching more efficient, but as + * this is only a performance optimization don't try too hard to get it. A 64k + * allocation can hold about 26M of 4k pages and 13G of 2M pages in an + * pfn_batch. Various destroy paths cannot fail and provide a small amount of + * stack memory as a backup contingency. If backup_len is given this cannot + * fail. + */ +static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) +{ + void *res; + + if (WARN_ON(*size == 0)) + return NULL; + + if (*size < backup_len) + return backup; + *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = PAGE_SIZE; + if (backup_len) { + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = backup_len; + return backup; + } + return kmalloc(*size, GFP_KERNEL); +} + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter) +{ + unsigned long last_hole = ULONG_MAX; + unsigned int i; + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) { + if (interval_tree_span_iter_done(&iter->spans[i])) { + iter->is_used = -1; + return; + } + + if (iter->spans[i].is_hole) { + last_hole = min(last_hole, iter->spans[i].last_hole); + continue; + } + + iter->is_used = i + 1; + iter->start_used = iter->spans[i].start_used; + iter->last_used = min(iter->spans[i].last_used, last_hole); + return; + } + + iter->is_used = 0; + iter->start_hole = iter->spans[0].start_hole; + iter->last_hole = + min(iter->spans[0].last_hole, iter->spans[1].last_hole); +} + +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index) +{ + unsigned int i; + + iter->itrees[0] = itree1; + iter->itrees[1] = itree2; + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i], + first_index, last_index); + interval_tree_double_span_iter_update(iter); +} + +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter) +{ + unsigned int i; + + if (iter->is_used == -1 || + iter->last_hole == iter->spans[0].last_index) { + iter->is_used = -1; + return; + } + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_advance( + &iter->spans[i], iter->itrees[i], iter->last_hole + 1); + interval_tree_double_span_iter_update(iter); +} + +static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) +{ + pages->npinned += npages; +} + +static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) +{ + pages->npinned -= npages; +} + +static void iopt_pages_err_unpin(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **page_list) +{ + unsigned long npages = last_index - start_index + 1; + + unpin_user_pages(page_list, npages); + iopt_pages_sub_npinned(pages, npages); +} + +/* + * index is the number of PAGE_SIZE units from the start of the area's + * iopt_pages. If the iova is sub page-size then the area has an iova that + * covers a portion of the first and last pages in the range. + */ +static unsigned long iopt_area_index_to_iova(struct iopt_area *area, + unsigned long index) +{ + index -= iopt_area_index(area); + if (index == 0) + return iopt_area_iova(area); + return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE; +} + +static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, + unsigned long index) +{ + if (index == iopt_area_last_index(area)) + return iopt_area_last_iova(area); + return iopt_area_iova(area) - area->page_offset + + (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1; +} + +static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + size_t ret; + + ret = iommu_unmap(domain, iova, size); + /* + * It is a logic error in this code or a driver bug if the IOMMU unmaps + * something other than exactly as requested. This implies that the + * iommu driver may not fail unmap for reasons beyond bad agruments. + * Particularly, the iommu driver may not do a memory allocation on the + * unmap path. + */ + WARN_ON(ret != size); +} + +static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, + unsigned long index) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_first(&pages->domains_itree, index, index); + if (!node) + return NULL; + return container_of(node, struct iopt_area, pages_node); +} + +/* + * A simple datastructure to hold a vector of PFNs, optimized for contiguous + * PFNs. This is used as a temporary holding memory for shuttling pfns from one + * place to another. Generally everything is made more efficient if operations + * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles, + * better cache locality, etc + */ +struct pfn_batch { + unsigned long *pfns; + u32 *npfns; + unsigned int array_size; + unsigned int end; + unsigned int total_pfns; +}; + +static void batch_clear(struct pfn_batch *batch) +{ + batch->total_pfns = 0; + batch->end = 0; + batch->pfns[0] = 0; + batch->npfns[0] = 0; +} + +/* + * Carry means we carry a portion of the final hugepage over to the front of the + * batch + */ +static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) +{ + if (!keep_pfns) + return batch_clear(batch); + + batch->total_pfns = keep_pfns; + batch->npfns[0] = keep_pfns; + batch->pfns[0] = batch->pfns[batch->end - 1] + + (batch->npfns[batch->end - 1] - keep_pfns); + batch->end = 0; +} + +static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) +{ + if (!batch->total_pfns) + return; + skip_pfns = min(batch->total_pfns, skip_pfns); + batch->pfns[0] += skip_pfns; + batch->npfns[0] -= skip_pfns; + batch->total_pfns -= skip_pfns; +} + +static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, + size_t backup_len) +{ + const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns); + size_t size = max_pages * elmsz; + + batch->pfns = temp_kmalloc(&size, backup, backup_len); + if (!batch->pfns) + return -ENOMEM; + batch->array_size = size / elmsz; + batch->npfns = (u32 *)(batch->pfns + batch->array_size); + batch_clear(batch); + return 0; +} + +static int batch_init(struct pfn_batch *batch, size_t max_pages) +{ + return __batch_init(batch, max_pages, NULL, 0); +} + +static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, + void *backup, size_t backup_len) +{ + __batch_init(batch, max_pages, backup, backup_len); +} + +static void batch_destroy(struct pfn_batch *batch, void *backup) +{ + if (batch->pfns != backup) + kfree(batch->pfns); +} + +/* true if the pfn could be added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); + + if (batch->end && + pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && + batch->npfns[batch->end - 1] != MAX_NPFNS) { + batch->npfns[batch->end - 1]++; + batch->total_pfns++; + return true; + } + if (batch->end == batch->array_size) + return false; + batch->total_pfns++; + batch->pfns[batch->end] = pfn; + batch->npfns[batch->end] = 1; + batch->end++; + return true; +} + +/* + * Fill the batch with pfns from the domain. When the batch is full, or it + * reaches last_index, the function will return. The caller should use + * batch->total_pfns to determine the starting point for the next iteration. + */ +static void batch_from_domain(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + /* + * This is pretty slow, it would be nice to get the page size + * back from the driver, or have the driver directly fill the + * batch. + */ + phys = iommu_iova_to_phys(domain, iova) - page_offset; + if (!batch_add_pfn(batch, PHYS_PFN(phys))) + return; + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } +} + +static struct page **raw_pages_from_domain(struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + phys = iommu_iova_to_phys(domain, iova) - page_offset; + *(out_pages++) = pfn_to_page(PHYS_PFN(phys)); + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } + return out_pages; +} + +/* Continues reading a domain until we reach a discontiguity in the pfns. */ +static void batch_from_domain_continue(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index) +{ + unsigned int array_size = batch->array_size; + + batch->array_size = batch->end; + batch_from_domain(batch, domain, area, start_index, last_index); + batch->array_size = array_size; +} + +/* + * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That + * mode permits splitting a mapped area up, and then one of the splits is + * unmapped. Doing this normally would cause us to violate our invariant of + * pairing map/unmap. Thus, to support old VFIO compatibility disable support + * for batching consecutive PFNs. All PFNs mapped into the iommu are done in + * PAGE_SIZE units, not larger or smaller. + */ +static int batch_iommu_map_small(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + unsigned long start_iova = iova; + int rc; + + while (size) { + rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot); + if (rc) + goto err_unmap; + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + size -= PAGE_SIZE; + } + return 0; + +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index) +{ + bool disable_large_pages = area->iopt->disable_large_pages; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned int page_offset = 0; + unsigned long start_iova; + unsigned long next_iova; + unsigned int cur = 0; + unsigned long iova; + int rc; + + /* The first index might be a partial page */ + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + next_iova = iova = start_iova = + iopt_area_index_to_iova(area, start_index); + while (cur < batch->end) { + next_iova = min(last_iova + 1, + next_iova + batch->npfns[cur] * PAGE_SIZE - + page_offset); + if (disable_large_pages) + rc = batch_iommu_map_small( + domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + else + rc = iommu_map(domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + if (rc) + goto err_unmap; + iova = next_iova; + page_offset = 0; + cur++; + } + return 0; +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + rcu_read_lock(); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry)) || + start_index == last_index) + break; + start_index++; + } + rcu_read_unlock(); +} + +static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry))) + break; + xas_store(&xas, NULL); + if (start_index == last_index) + break; + start_index++; + } + xas_unlock(&xas); +} + +static void clear_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + xas_for_each(&xas, entry, last_index) + xas_store(&xas, NULL); + xas_unlock(&xas); +} + +static int pages_to_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index, struct page **pages) +{ + struct page **end_pages = pages + (last_index - start_index) + 1; + XA_STATE(xas, xa, start_index); + + do { + void *old; + + xas_lock(&xas); + while (pages != end_pages) { + old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); + if (xas_error(&xas)) + break; + WARN_ON(old); + pages++; + xas_next(&xas); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + if (xas_error(&xas)) { + if (xas.xa_index != start_index) + clear_xarray(xa, start_index, xas.xa_index - 1); + return xas_error(&xas); + } + return 0; +} + +static void batch_from_pages(struct pfn_batch *batch, struct page **pages, + size_t npages) +{ + struct page **end = pages + npages; + + for (; pages != end; pages++) + if (!batch_add_pfn(batch, page_to_pfn(*pages))) + break; +} + +static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, + unsigned int first_page_off, size_t npages) +{ + unsigned int cur = 0; + + while (first_page_off) { + if (batch->npfns[cur] > first_page_off) + break; + first_page_off -= batch->npfns[cur]; + cur++; + } + + while (npages) { + size_t to_unpin = min_t(size_t, npages, + batch->npfns[cur] - first_page_off); + + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); + iopt_pages_sub_npinned(pages, to_unpin); + cur++; + first_page_off = 0; + npages -= to_unpin; + } +} + +static void copy_data_page(struct page *page, void *data, unsigned long offset, + size_t length, unsigned int flags) +{ + void *mem; + + mem = kmap_local_page(page); + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + memcpy(mem + offset, data, length); + set_page_dirty_lock(page); + } else { + memcpy(data, mem + offset, length); + } + kunmap_local(mem); +} + +static unsigned long batch_rw(struct pfn_batch *batch, void *data, + unsigned long offset, unsigned long length, + unsigned int flags) +{ + unsigned long copied = 0; + unsigned int npage = 0; + unsigned int cur = 0; + + while (cur < batch->end) { + unsigned long bytes = min(length, PAGE_SIZE - offset); + + copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data, + offset, bytes, flags); + offset = 0; + length -= bytes; + data += bytes; + copied += bytes; + npage++; + if (npage == batch->npfns[cur]) { + npage = 0; + cur++; + } + if (!length) + break; + } + return copied; +} + +/* pfn_reader_user is just the pin_user_pages() path */ +struct pfn_reader_user { + struct page **upages; + size_t upages_len; + unsigned long upages_start; + unsigned long upages_end; + unsigned int gup_flags; + /* + * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is + * neither + */ + int locked; +}; + +static void pfn_reader_user_init(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + user->upages = NULL; + user->upages_start = 0; + user->upages_end = 0; + user->locked = -1; + + if (pages->writable) { + user->gup_flags = FOLL_LONGTERM | FOLL_WRITE; + } else { + /* Still need to break COWs on read */ + user->gup_flags = FOLL_LONGTERM | FOLL_FORCE | FOLL_WRITE; + } +} + +static void pfn_reader_user_destroy(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + if (user->locked != -1) { + if (user->locked) + mmap_read_unlock(pages->source_mm); + if (pages->source_mm != current->mm) + mmput(pages->source_mm); + user->locked = 0; + } + + kfree(user->upages); + user->upages = NULL; +} + +static int pfn_reader_user_pin(struct pfn_reader_user *user, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + bool remote_mm = pages->source_mm != current->mm; + unsigned long npages; + uintptr_t uptr; + long rc; + + if (!user->upages) { + /* All undone in pfn_reader_destroy() */ + user->upages_len = + (last_index - start_index + 1) * sizeof(*user->upages); + user->upages = temp_kmalloc(&user->upages_len, NULL, 0); + if (!user->upages) + return -ENOMEM; + } + + if (user->locked == -1) { + /* + * The majority of usages will run the map task within the mm + * providing the pages, so we can optimize into + * get_user_pages_fast() + */ + if (remote_mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EFAULT; + } + user->locked = 0; + } + + npages = min_t(unsigned long, last_index - start_index + 1, + user->upages_len / sizeof(*user->upages)); + + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); + if (!remote_mm) + rc = pin_user_pages_fast(uptr, npages, user->gup_flags, + user->upages); + else { + if (!user->locked) { + mmap_read_lock(pages->source_mm); + user->locked = 1; + } + /* + * FIXME: last NULL can be &pfns->locked once the GUP patch + * is merged. + */ + rc = pin_user_pages_remote(pages->source_mm, uptr, npages, + user->gup_flags, user->upages, NULL, + NULL); + } + if (rc <= 0) { + if (WARN_ON(!rc)) + return -EFAULT; + return rc; + } + iopt_pages_add_npinned(pages, rc); + user->upages_start = start_index; + user->upages_end = start_index + rc; + return 0; +} + +/* This is the "modern" and faster accounting method used by io_uring */ +static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + unsigned long lock_limit; + unsigned long cur_pages; + unsigned long new_pages; + + lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> + PAGE_SHIFT; + npages = pages->npinned - pages->last_npinned; + do { + cur_pages = atomic_long_read(&pages->source_user->locked_vm); + new_pages = cur_pages + npages; + if (new_pages > lock_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, + new_pages) != cur_pages); + return 0; +} + +static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)) + return; + atomic_long_sub(npages, &pages->source_user->locked_vm); +} + +/* This is the accounting method used for compatibility with VFIO */ +static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + bool do_put = false; + int rc; + + if (user && user->locked) { + mmap_read_unlock(pages->source_mm); + user->locked = 0; + /* If we had the lock then we also have a get */ + } else if ((!user || !user->upages) && + pages->source_mm != current->mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EINVAL; + do_put = true; + } + + mmap_write_lock(pages->source_mm); + rc = __account_locked_vm(pages->source_mm, npages, inc, + pages->source_task, false); + mmap_write_unlock(pages->source_mm); + + if (do_put) + mmput(pages->source_mm); + return rc; +} + +static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + int rc = 0; + + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + break; + case IOPT_PAGES_ACCOUNT_USER: + if (inc) + rc = incr_user_locked_vm(pages, npages); + else + decr_user_locked_vm(pages, npages); + break; + case IOPT_PAGES_ACCOUNT_MM: + rc = update_mm_locked_vm(pages, npages, inc, user); + break; + } + if (rc) + return rc; + + pages->last_npinned = pages->npinned; + if (inc) + atomic64_add(npages, &pages->source_mm->pinned_vm); + else + atomic64_sub(npages, &pages->source_mm->pinned_vm); + return 0; +} + +static void update_unpinned(struct iopt_pages *pages) +{ + if (WARN_ON(pages->npinned > pages->last_npinned)) + return; + if (pages->npinned == pages->last_npinned) + return; + do_update_pinned(pages, pages->last_npinned - pages->npinned, false, + NULL); +} + +/* + * Changes in the number of pages pinned is done after the pages have been read + * and processed. If the user lacked the limit then the error unwind will unpin + * everything that was just pinned. This is because it is expensive to calculate + * how many pages we have already pinned within a range to generate an accurate + * prediction in advance of doing the work to actually pin them. + */ +static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + unsigned long npages; + bool inc; + + lockdep_assert_held(&pages->mutex); + + if (pages->npinned == pages->last_npinned) + return 0; + + if (pages->npinned < pages->last_npinned) { + npages = pages->last_npinned - pages->npinned; + inc = false; + } else { + npages = pages->npinned - pages->last_npinned; + inc = true; + } + return do_update_pinned(pages, npages, inc, user); +} + +/* + * PFNs are stored in three places, in order of preference: + * - The iopt_pages xarray. This is only populated if there is a + * iopt_pages_access + * - The iommu_domain under an area + * - The original PFN source, ie pages->source_mm + * + * This iterator reads the pfns optimizing to load according to the + * above order. + */ +struct pfn_reader { + struct iopt_pages *pages; + struct interval_tree_double_span_iter span; + struct pfn_batch batch; + unsigned long batch_start_index; + unsigned long batch_end_index; + unsigned long last_index; + + struct pfn_reader_user user; +}; + +static int pfn_reader_update_pinned(struct pfn_reader *pfns) +{ + return pfn_reader_user_update_pinned(&pfns->user, pfns->pages); +} + +/* + * The batch can contain a mixture of pages that are still in use and pages that + * need to be unpinned. Unpin only pages that are not held anywhere else. + */ +static void pfn_reader_unpin(struct pfn_reader *pfns) +{ + unsigned long last = pfns->batch_end_index - 1; + unsigned long start = pfns->batch_start_index; + struct interval_tree_double_span_iter span; + struct iopt_pages *pages = pfns->pages; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start, last) { + if (span.is_used) + continue; + + batch_unpin(&pfns->batch, pages, span.start_hole - start, + span.last_hole - span.start_hole + 1); + } +} + +/* Process a single span to load it from the proper storage */ +static int pfn_reader_fill_span(struct pfn_reader *pfns) +{ + struct interval_tree_double_span_iter *span = &pfns->span; + unsigned long start_index = pfns->batch_end_index; + struct iopt_area *area; + int rc; + + if (span->is_used == 1) { + batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, + start_index, span->last_used); + return 0; + } + + if (span->is_used == 2) { + /* + * Pull as many pages from the first domain we find in the + * target span. If it is too small then we will be called again + * and we'll find another area. + */ + area = iopt_pages_find_domain_area(pfns->pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + /* The storage_domain cannot change without the pages mutex */ + batch_from_domain( + &pfns->batch, area->storage_domain, area, start_index, + min(iopt_area_last_index(area), span->last_used)); + return 0; + } + + if (start_index >= pfns->user.upages_end) { + rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + span->last_hole); + if (rc) + return rc; + } + + batch_from_pages(&pfns->batch, + pfns->user.upages + + (start_index - pfns->user.upages_start), + pfns->user.upages_end - start_index); + return 0; +} + +static bool pfn_reader_done(struct pfn_reader *pfns) +{ + return pfns->batch_start_index == pfns->last_index + 1; +} + +static int pfn_reader_next(struct pfn_reader *pfns) +{ + int rc; + + batch_clear(&pfns->batch); + pfns->batch_start_index = pfns->batch_end_index; + + while (pfns->batch_end_index != pfns->last_index + 1) { + unsigned int npfns = pfns->batch.total_pfns; + + rc = pfn_reader_fill_span(pfns); + if (rc) + return rc; + + if (WARN_ON(!pfns->batch.total_pfns)) + return -EINVAL; + + pfns->batch_end_index = + pfns->batch_start_index + pfns->batch.total_pfns; + if (pfns->batch_end_index == pfns->span.last_used + 1) + interval_tree_double_span_iter_next(&pfns->span); + + /* Batch is full */ + if (npfns == pfns->batch.total_pfns) + return 0; + } + return 0; +} + +static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + lockdep_assert_held(&pages->mutex); + + pfns->pages = pages; + pfns->batch_start_index = start_index; + pfns->batch_end_index = start_index; + pfns->last_index = last_index; + pfn_reader_user_init(&pfns->user, pages); + rc = batch_init(&pfns->batch, last_index - start_index + 1); + if (rc) + return rc; + interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index); + return 0; +} + +/* + * There are many assertions regarding the state of pages->npinned vs + * pages->last_pinned, for instance something like unmapping a domain must only + * decrement the npinned, and pfn_reader_destroy() must be called only after all + * the pins are updated. This is fine for success flows, but error flows + * sometimes need to release the pins held inside the pfn_reader before going on + * to complete unmapping and releasing pins held in domains. + */ +static void pfn_reader_release_pins(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + if (pfns->user.upages_end > pfns->batch_end_index) { + size_t npages = pfns->user.upages_end - pfns->batch_end_index; + + /* Any pages not transferred to the batch are just unpinned */ + unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - + pfns->user.upages_start), + npages); + iopt_pages_sub_npinned(pages, npages); + pfns->user.upages_end = pfns->batch_end_index; + } + if (pfns->batch_start_index != pfns->batch_end_index) { + pfn_reader_unpin(pfns); + pfns->batch_start_index = pfns->batch_end_index; + } +} + +static void pfn_reader_destroy(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + pfn_reader_release_pins(pfns); + pfn_reader_user_destroy(&pfns->user, pfns->pages); + batch_destroy(&pfns->batch, NULL); + WARN_ON(pages->last_npinned != pages->npinned); +} + +static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + rc = pfn_reader_init(pfns, pages, start_index, last_index); + if (rc) + return rc; + rc = pfn_reader_next(pfns); + if (rc) { + pfn_reader_destroy(pfns); + return rc; + } + return 0; +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index d1817472c2737..26e09d539737b 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -13,6 +13,13 @@ struct iommufd_ctx; struct file; +enum { + IOMMUFD_ACCESS_RW_READ = 0, + IOMMUFD_ACCESS_RW_WRITE = 1 << 0, + /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ + IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, +}; + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) -- GitLab From 8d160cd4d5066f864ec0f2c981470e55ac03ac27 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:32 -0400 Subject: [PATCH 1008/1423] iommufd: Algorithms for PFN storage The iopt_pages which represents a logical linear list of full PFNs held in different storage tiers. Each area points to a slice of exactly one iopt_pages, and each iopt_pages can have multiple areas and accesses. The three storage tiers are managed to meet these objectives: - If no iommu_domain or in-kerenel access exists then minimal memory should be consumed by iomufd - If a page has been pinned then an iopt_pages will not pin it again - If an in-kernel access exists then the xarray must provide the backing storage to avoid allocations on domain removals - Otherwise any iommu_domain will be used for storage In a common configuration with only an iommu_domain the iopt_pages does not allocate significant memory itself. The external interface for pages has several logical operations: iopt_area_fill_domain() will load the PFNs from storage into a single domain. This is used when attaching a new domain to an existing IOAS. iopt_area_fill_domains() will load the PFNs from storage into multiple domains. This is used when creating a new IOVA map in an existing IOAS iopt_pages_add_access() creates an iopt_pages_access that tracks an in-kernel access of PFNs. This is some external driver that might be accessing the IOVA using the CPU, or programming PFNs with the DMA API. ie a VFIO mdev. iopt_pages_rw_access() directly perform a memcpy on the PFNs, without the overhead of iopt_pages_add_access() iopt_pages_fill_xarray() will load PFNs into the xarray and return a 'struct page *' array. It is used by iopt_pages_access's to extract PFNs for in-kernel use. iopt_pages_fill_from_xarray() is a fast path when it is known the xarray is already filled. As an iopt_pages can be referred to in slices by many areas and accesses it uses interval trees to keep track of which storage tiers currently hold the PFNs. On a page-by-page basis any request for a PFN will be satisfied from one of the storage tiers and the PFN copied to target domain/array. Unfill actions are similar, on a page by page basis domains are unmapped, xarray entries freed or struct pages fully put back. Significant complexity is required to fully optimize all of these data motions. The implementation calculates the largest consecutive range of same-storage indexes and operates in blocks. The accumulation of PFNs always generates the largest contiguous PFN range possible to optimize and this gathering can cross storage tier boundaries. For cases like 'fill domains' care is taken to avoid duplicated work and PFNs are read once and pushed into all domains. The map/unmap interaction with the iommu_domain always works in contiguous PFN blocks. The implementation does not require or benefit from any split/merge optimization in the iommu_domain driver. This design suggests several possible improvements in the IOMMU API that would greatly help performance, particularly a way for the driver to map and read the pfns lists instead of working with one driver call per page to read, and one driver call per contiguous range to store. Link: https://lore.kernel.org/r/9-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.h | 74 +++ drivers/iommu/iommufd/pages.c | 843 +++++++++++++++++++++++++++ 2 files changed, 917 insertions(+) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index b74bf01ffc52c..a2b724175057b 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -49,6 +49,15 @@ struct iopt_area { unsigned int num_accesses; }; +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); + +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain); +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain); +void iopt_area_unmap_domain(struct iopt_area *area, + struct iommu_domain *domain); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -69,6 +78,39 @@ static inline unsigned long iopt_area_last_iova(struct iopt_area *area) return area->node.last; } +static inline size_t iopt_area_length(struct iopt_area *area) +{ + return (area->node.last - area->node.start) + 1; +} + +#define __make_iopt_iter(name) \ + static inline struct iopt_##name *iopt_##name##_iter_first( \ + struct io_pagetable *iopt, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + lockdep_assert_held(&iopt->iova_rwsem); \ + node = interval_tree_iter_first(&iopt->name##_itree, start, \ + last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } \ + static inline struct iopt_##name *iopt_##name##_iter_next( \ + struct iopt_##name *last_node, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + node = interval_tree_iter_next(&last_node->node, start, last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } + +__make_iopt_iter(area) + enum { IOPT_PAGES_ACCOUNT_NONE = 0, IOPT_PAGES_ACCOUNT_USER = 1, @@ -106,4 +148,36 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable); +void iopt_release_pages(struct kref *kref); +static inline void iopt_put_pages(struct iopt_pages *pages) +{ + kref_put(&pages->kref, iopt_release_pages); +} + +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last); + +int iopt_area_add_access(struct iopt_area *area, unsigned long start, + unsigned long last, struct page **out_pages, + unsigned int flags); +void iopt_area_remove_access(struct iopt_area *area, unsigned long start, + unsigned long last); +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags); + +/* + * Each interval represents an active iopt_access_pages(), it acts as an + * interval lock that keeps the PFNs pinned and stored in the xarray. + */ +struct iopt_pages_access { + struct interval_tree_node node; + unsigned int users; +}; + #endif diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index ebca78e743c6f..bafeee9d73e8a 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -212,6 +212,18 @@ static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, WARN_ON(ret != size); } +static void iopt_area_unmap_domain_range(struct iopt_area *area, + struct iommu_domain *domain, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start_iova = iopt_area_index_to_iova(area, start_index); + + iommu_unmap_nofail(domain, start_iova, + iopt_area_index_to_iova_last(area, last_index) - + start_iova + 1); +} + static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, unsigned long index) { @@ -1064,3 +1076,834 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, } return 0; } + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable) +{ + struct iopt_pages *pages; + + /* + * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP + * below from overflow + */ + if (length > SIZE_MAX - PAGE_SIZE || length == 0) + return ERR_PTR(-EINVAL); + + pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) + return ERR_PTR(-ENOMEM); + + kref_init(&pages->kref); + xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT); + mutex_init(&pages->mutex); + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->access_itree = RB_ROOT_CACHED; + pages->domains_itree = RB_ROOT_CACHED; + pages->writable = writable; + if (capable(CAP_IPC_LOCK)) + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + else + pages->account_mode = IOPT_PAGES_ACCOUNT_USER; + pages->source_task = current->group_leader; + get_task_struct(current->group_leader); + pages->source_user = get_uid(current_user()); + return pages; +} + +void iopt_release_pages(struct kref *kref) +{ + struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); + + WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root)); + WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root)); + WARN_ON(pages->npinned); + WARN_ON(!xa_empty(&pages->pinned_pfns)); + mmdrop(pages->source_mm); + mutex_destroy(&pages->mutex); + put_task_struct(pages->source_task); + free_uid(pages->source_user); + kfree(pages); +} + +static void +iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, + struct iopt_pages *pages, struct iommu_domain *domain, + unsigned long start_index, unsigned long last_index, + unsigned long *unmapped_end_index, + unsigned long real_last_index) +{ + while (start_index <= last_index) { + unsigned long batch_last_index; + + if (*unmapped_end_index <= last_index) { + unsigned long start = + max(start_index, *unmapped_end_index); + + batch_from_domain(batch, domain, area, start, + last_index); + batch_last_index = start + batch->total_pfns - 1; + } else { + batch_last_index = last_index; + } + + /* + * unmaps must always 'cut' at a place where the pfns are not + * contiguous to pair with the maps that always install + * contiguous pages. Thus, if we have to stop unpinning in the + * middle of the domains we need to keep reading pfns until we + * find a cut point to do the unmap. The pfns we read are + * carried over and either skipped or integrated into the next + * batch. + */ + if (batch_last_index == last_index && + last_index != real_last_index) + batch_from_domain_continue(batch, domain, area, + last_index + 1, + real_last_index); + + if (*unmapped_end_index <= batch_last_index) { + iopt_area_unmap_domain_range( + area, domain, *unmapped_end_index, + start_index + batch->total_pfns - 1); + *unmapped_end_index = start_index + batch->total_pfns; + } + + /* unpin must follow unmap */ + batch_unpin(batch, pages, 0, + batch_last_index - start_index + 1); + start_index = batch_last_index + 1; + + batch_clear_carry(batch, + *unmapped_end_index - batch_last_index - 1); + } +} + +static void __iopt_area_unfill_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + unsigned long start_index = iopt_area_index(area); + unsigned long unmapped_end_index = start_index; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + + lockdep_assert_held(&pages->mutex); + + /* + * For security we must not unpin something that is still DMA mapped, + * so this must unmap any IOVA before we go ahead and unpin the pages. + * This creates a complexity where we need to skip over unpinning pages + * held in the xarray, but continue to unmap from the domain. + * + * The domain unmap cannot stop in the middle of a contiguous range of + * PFNs. To solve this problem the unpinning step will read ahead to the + * end of any contiguous span, unmap that whole span, and then only + * unpin the leading part that does not have any accesses. The residual + * PFNs that were unmapped but not unpinned are called a "carry" in the + * batch as they are moved to the front of the PFN list and continue on + * to the next iteration(s). + */ + batch_init_backup(&batch, last_index + 1, backup, sizeof(backup)); + interval_tree_for_each_double_span(&span, &pages->domains_itree, + &pages->access_itree, start_index, + last_index) { + if (span.is_used) { + batch_skip_carry(&batch, + span.last_used - span.start_used + 1); + continue; + } + iopt_area_unpin_domain(&batch, area, pages, domain, + span.start_hole, span.last_hole, + &unmapped_end_index, last_index); + } + /* + * If the range ends in a access then we do the residual unmap without + * any unpins. + */ + if (unmapped_end_index != last_index + 1) + iopt_area_unmap_domain_range(area, domain, unmapped_end_index, + last_index); + WARN_ON(batch.total_pfns); + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +static void iopt_area_unfill_partial_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long end_index) +{ + if (end_index != iopt_area_index(area)) + __iopt_area_unfill_domain(area, pages, domain, end_index - 1); +} + +/** + * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain + * @area: The IOVA range to unmap + * @domain: The domain to unmap + * + * The caller must know that unpinning is not required, usually because there + * are other domains in the iopt. + */ +void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + iommu_unmap_nofail(domain, iopt_area_iova(area), + iopt_area_length(area)); +} + +/** + * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain + * @area: IOVA area to use + * @pages: page supplier for the area (area->pages is NULL) + * @domain: Domain to unmap from + * + * The domain should be removed from the domains_itree before calling. The + * domain will always be unmapped, but the PFNs may not be unpinned if there are + * still accesses. + */ +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain) +{ + __iopt_area_unfill_domain(area, pages, domain, + iopt_area_last_index(area)); +} + +/** + * iopt_area_fill_domain() - Map PFNs from the area into a domain + * @area: IOVA area to use + * @domain: Domain to load PFNs into + * + * Read the pfns from the area's underlying iopt_pages and map them into the + * given domain. Called when attaching a new domain to an io_pagetable. + */ +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + unsigned long done_end_index; + struct pfn_reader pfns; + int rc; + + lockdep_assert_held(&area->pages->mutex); + + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + return rc; + + while (!pfn_reader_done(&pfns)) { + done_end_index = pfns.batch_start_index; + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + done_end_index = pfns.batch_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + iopt_area_unfill_partial_domain(area, area->pages, domain, + done_end_index); +out_destroy: + pfn_reader_destroy(&pfns); + return rc; +} + +/** + * iopt_area_fill_domains() - Install PFNs into the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area creation. The area is freshly created and not inserted in + * the domains_itree yet. PFNs are read and loaded into every domain held in the + * area's io_pagetable and the area is installed in the domains_itree. + * + * On failure all domains are left unchanged. + */ +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + unsigned long done_first_end_index; + unsigned long done_all_end_index; + struct iommu_domain *domain; + unsigned long unmap_index; + struct pfn_reader pfns; + unsigned long index; + int rc; + + lockdep_assert_held(&area->iopt->domains_rwsem); + + if (xa_empty(&area->iopt->domains)) + return 0; + + mutex_lock(&pages->mutex); + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + + area->storage_domain = xa_load(&area->iopt->domains, 0); + interval_tree_insert(&area->pages_node, &pages->domains_itree); + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + xa_for_each(&area->iopt->domains, unmap_index, domain) { + unsigned long end_index; + + if (unmap_index < index) + end_index = done_first_end_index; + else + end_index = done_all_end_index; + + /* + * The area is not yet part of the domains_itree so we have to + * manage the unpinning specially. The last domain does the + * unpin, every other domain is just unmapped. + */ + if (unmap_index != area->iopt->next_domain_id - 1) { + if (end_index != iopt_area_index(area)) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + end_index - 1); + } else { + iopt_area_unfill_partial_domain(area, pages, domain, + end_index); + } + } +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_unfill_domains() - unmap PFNs from the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area destruction. This unmaps the iova's covered by all the + * area's domains and releases the PFNs. + */ +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + struct io_pagetable *iopt = area->iopt; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held(&iopt->domains_rwsem); + + mutex_lock(&pages->mutex); + if (!area->storage_domain) + goto out_unlock; + + xa_for_each(&iopt->domains, index, domain) + if (domain != area->storage_domain) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + iopt_area_last_index(area)); + + interval_tree_remove(&area->pages_node, &pages->domains_itree); + iopt_area_unfill_domain(area, pages, area->storage_domain); + area->storage_domain = NULL; +out_unlock: + mutex_unlock(&pages->mutex); +} + +static void iopt_pages_unpin_xarray(struct pfn_batch *batch, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long end_index) +{ + while (start_index <= end_index) { + batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index, + end_index); + batch_unpin(batch, pages, 0, batch->total_pfns); + start_index += batch->total_pfns; + batch_clear(batch); + } +} + +/** + * iopt_pages_unfill_xarray() - Update the xarry after removing an access + * @pages: The pages to act on + * @start_index: Starting PFN index + * @last_index: Last PFN index + * + * Called when an iopt_pages_access is removed, removes pages from the itree. + * The access should already be removed from the access_itree. + */ +void iopt_pages_unfill_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + bool batch_inited = false; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + if (!span.is_used) { + if (!batch_inited) { + batch_init_backup(&batch, + last_index - start_index + 1, + backup, sizeof(backup)); + batch_inited = true; + } + iopt_pages_unpin_xarray(&batch, pages, span.start_hole, + span.last_hole); + } else if (span.is_used == 2) { + /* Covered by a domain */ + clear_xarray(&pages->pinned_pfns, span.start_used, + span.last_used); + } + /* Otherwise covered by an existing access */ + } + if (batch_inited) + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +/** + * iopt_pages_fill_from_xarray() - Fast path for reading PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages + * + * This can be called if the caller is holding a refcount on an + * iopt_pages_access that is known to have already been filled. It quickly reads + * the pages directly from the xarray. + * + * This is part of the SW iommu interface to read pages for in-kernel use. + */ +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + XA_STATE(xas, &pages->pinned_pfns, start_index); + void *entry; + + rcu_read_lock(); + while (start_index <= last_index) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + *(out_pages++) = pfn_to_page(xa_to_value(entry)); + start_index++; + } + rcu_read_unlock(); +} + +static int iopt_pages_fill_from_domain(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + while (start_index != last_index + 1) { + unsigned long domain_last; + struct iopt_area *area; + + area = iopt_pages_find_domain_area(pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + domain_last = min(iopt_area_last_index(area), last_index); + out_pages = raw_pages_from_domain(area->storage_domain, area, + start_index, domain_last, + out_pages); + start_index = domain_last + 1; + } + return 0; +} + +static int iopt_pages_fill_from_mm(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned long cur_index = start_index; + int rc; + + while (cur_index != last_index + 1) { + user->upages = out_pages + (cur_index - start_index); + rc = pfn_reader_user_pin(user, pages, cur_index, last_index); + if (rc) + goto out_unpin; + cur_index = user->upages_end; + } + return 0; + +out_unpin: + if (start_index != cur_index) + iopt_pages_err_unpin(pages, start_index, cur_index - 1, + out_pages); + return rc; +} + +/** + * iopt_pages_fill_xarray() - Read PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages, may be NULL + * + * This populates the xarray and returns the pages in out_pages. As the slow + * path this is able to copy pages from other storage tiers into the xarray. + * + * On failure the xarray is left unchanged. + * + * This is part of the SW iommu interface to read pages for in-kernel use. + */ +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, + unsigned long last_index, struct page **out_pages) +{ + struct interval_tree_double_span_iter span; + unsigned long xa_end = start_index; + struct pfn_reader_user user; + int rc; + + lockdep_assert_held(&pages->mutex); + + pfn_reader_user_init(&user, pages); + user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages); + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + struct page **cur_pages; + + if (span.is_used == 1) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_xarray(pages, span.start_used, + span.last_used, cur_pages); + continue; + } + + if (span.is_used == 2) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_domain(pages, span.start_used, + span.last_used, cur_pages); + rc = pages_to_xarray(&pages->pinned_pfns, + span.start_used, span.last_used, + cur_pages); + if (rc) + goto out_clean_xa; + xa_end = span.last_used + 1; + continue; + } + + /* hole */ + cur_pages = out_pages + (span.start_hole - start_index); + rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, + span.last_hole, cur_pages); + if (rc) + goto out_clean_xa; + rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, + span.last_hole, cur_pages); + if (rc) { + iopt_pages_err_unpin(pages, span.start_hole, + span.last_hole, cur_pages); + goto out_clean_xa; + } + xa_end = span.last_hole + 1; + } + rc = pfn_reader_user_update_pinned(&user, pages); + if (rc) + goto out_clean_xa; + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return 0; + +out_clean_xa: + if (start_index != xa_end) + iopt_pages_unfill_xarray(pages, start_index, xa_end - 1); + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return rc; +} + +/* + * This uses the pfn_reader instead of taking a shortcut by using the mm. It can + * do every scenario and is fully consistent with what an iommu_domain would + * see. + */ +static int iopt_pages_rw_slow(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, unsigned long offset, + void *data, unsigned long length, + unsigned int flags) +{ + struct pfn_reader pfns; + int rc; + + mutex_lock(&pages->mutex); + + rc = pfn_reader_first(&pfns, pages, start_index, last_index); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + unsigned long done; + + done = batch_rw(&pfns.batch, data, offset, length, flags); + data += done; + length -= done; + offset = 0; + pfn_reader_unpin(&pfns); + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_destroy; + } + if (WARN_ON(length != 0)) + rc = -EINVAL; +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/* + * A medium speed path that still allows DMA inconsistencies, but doesn't do any + * memory allocations or interval tree searches. + */ +static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, + unsigned long offset, void *data, + unsigned long length, unsigned int flags) +{ + struct page *page = NULL; + int rc; + + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, index, index, offset, data, + length, flags); + + mmap_read_lock(pages->source_mm); + rc = pin_user_pages_remote( + pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), + 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, + NULL, NULL); + mmap_read_unlock(pages->source_mm); + if (rc != 1) { + if (WARN_ON(rc >= 0)) + rc = -EINVAL; + goto out_mmput; + } + copy_data_page(page, data, offset, length, flags); + unpin_user_page(page); + rc = 0; + +out_mmput: + mmput(pages->source_mm); + return rc; +} + +/** + * iopt_pages_rw_access - Copy to/from a linear slice of the pages + * @pages: pages to act on + * @start_byte: First byte of pages to copy to/from + * @data: Kernel buffer to get/put the data + * @length: Number of bytes to copy + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * This will find each page in the range, kmap it and then memcpy to/from + * the given kernel buffer. + */ +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags) +{ + unsigned long start_index = start_byte / PAGE_SIZE; + unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE; + bool change_mm = current->mm != pages->source_mm; + int rc = 0; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { + if (start_index == last_index) + return iopt_pages_rw_page(pages, start_index, + start_byte % PAGE_SIZE, data, + length, flags); + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + } + + /* + * Try to copy using copy_to_user(). We do this as a fast path and + * ignore any pinning inconsistencies, unlike a real DMA path. + */ + if (change_mm) { + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, start_index, + last_index, + start_byte % PAGE_SIZE, data, + length, flags); + kthread_use_mm(pages->source_mm); + } + + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + if (copy_to_user(pages->uptr + start_byte, data, length)) + rc = -EFAULT; + } else { + if (copy_from_user(data, pages->uptr + start_byte, length)) + rc = -EFAULT; + } + + if (change_mm) { + kthread_unuse_mm(pages->source_mm); + mmput(pages->source_mm); + } + + return rc; +} + +static struct iopt_pages_access * +iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, + unsigned long last) +{ + struct interval_tree_node *node; + + lockdep_assert_held(&pages->mutex); + + /* There can be overlapping ranges in this interval tree */ + for (node = interval_tree_iter_first(&pages->access_itree, index, last); + node; node = interval_tree_iter_next(node, index, last)) + if (node->start == index && node->last == last) + return container_of(node, struct iopt_pages_access, + node); + return NULL; +} + +/** + * iopt_area_add_access() - Record an in-knerel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * @out_pages: Output list of struct page's representing the PFNs + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Record that an in-kernel access will be accessing the pages, ensure they are + * pinned, and return the PFNs as a simple list of 'struct page *'. + * + * This should be undone through a matching call to iopt_area_remove_access() + */ +int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index, struct page **out_pages, + unsigned int flags) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + int rc; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (access) { + area->num_accesses++; + access->users++; + iopt_pages_fill_from_xarray(pages, start_index, last_index, + out_pages); + mutex_unlock(&pages->mutex); + return 0; + } + + access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT); + if (!access) { + rc = -ENOMEM; + goto err_unlock; + } + + rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages); + if (rc) + goto err_free; + + access->node.start = start_index; + access->node.last = last_index; + access->users = 1; + area->num_accesses++; + interval_tree_insert(&access->node, &pages->access_itree); + mutex_unlock(&pages->mutex); + return 0; + +err_free: + kfree(access); +err_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_remove_access() - Release an in-kernel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * + * Undo iopt_area_add_access() and unpin the pages if necessary. The caller + * must stop using the PFNs before calling this. + */ +void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (WARN_ON(!access)) + goto out_unlock; + + WARN_ON(area->num_accesses == 0 || access->users == 0); + area->num_accesses--; + access->users--; + if (access->users) + goto out_unlock; + + interval_tree_remove(&access->node, &pages->access_itree); + iopt_pages_unfill_xarray(pages, start_index, last_index); + kfree(access); +out_unlock: + mutex_unlock(&pages->mutex); +} -- GitLab From 51fe6141f0f64ae0bbc096a41a07572273e8c0ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:33 -0400 Subject: [PATCH 1009/1423] iommufd: Data structure to provide IOVA to PFN mapping This is the remainder of the IOAS data structure. Provide an object called an io_pagetable that is composed of iopt_areas pointing at iopt_pages, along with a list of iommu_domains that mirror the IOVA to PFN map. At the top this is a simple interval tree of iopt_areas indicating the map of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based on the attached domains there is a minimum alignment for areas (which may be smaller than PAGE_SIZE), an interval tree of reserved IOVA that can't be mapped and an IOVA of allowed IOVA that can always be mappable. The concept of an 'access' refers to something like a VFIO mdev that is accessing the IOVA and using a 'struct page *' for CPU based access. Externally an API is provided that matches the requirements of the IOCTL interface for map/unmap and domain attachment. The API provides a 'copy' primitive to establish a new IOVA map in a different IOAS from an existing mapping by re-using the iopt_pages. This is the basic mechanism to provide single pinning. This is designed to support a pre-registration flow where userspace would setup an dummy IOAS with no domains, map in memory and then establish an access to pin all PFNs into the xarray. Copy can then be used to create new IOVA mappings in a different IOAS, with iommu_domains attached. Upon copy the PFNs will be read out of the xarray and mapped into the iommu_domains, avoiding any pin_user_pages() overheads. Link: https://lore.kernel.org/r/10-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/io_pagetable.c | 1186 +++++++++++++++++++++++ drivers/iommu/iommufd/io_pagetable.h | 55 ++ drivers/iommu/iommufd/iommufd_private.h | 52 + 5 files changed, 1295 insertions(+) create mode 100644 drivers/iommu/iommufd/io_pagetable.c diff --git a/.clang-format b/.clang-format index 501241f897766..78aba4a10b1bb 100644 --- a/.clang-format +++ b/.clang-format @@ -444,6 +444,7 @@ ForEachMacros: - 'interval_tree_for_each_span' - 'intlist__for_each_entry' - 'intlist__for_each_entry_safe' + - 'iopt_for_each_contig_area' - 'kcore_copy__for_each_phdr' - 'key_for_each' - 'key_for_each_safe' diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 05a0e91e30afa..b66a8c47ff55e 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ + io_pagetable.o \ main.o \ pages.o diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c new file mode 100644 index 0000000000000..756d347948f0e --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -0,0 +1,1186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The + * PFNs can be placed into an iommu_domain, or returned to the caller as a page + * list for access by an in-kernel user. + * + * The datastructure uses the iopt_pages to optimize the storage of the PFNs + * between the domains and xarray. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "io_pagetable.h" +#include "double_span.h" + +struct iopt_pages_list { + struct iopt_pages *pages; + struct iopt_area *area; + struct list_head next; + unsigned long start_byte; + unsigned long length; +}; + +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova) +{ + lockdep_assert_held(&iopt->iova_rwsem); + + iter->cur_iova = iova; + iter->last_iova = last_iova; + iter->area = iopt_area_iter_first(iopt, iova, iova); + if (!iter->area) + return NULL; + if (!iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) +{ + unsigned long last_iova; + + if (!iter->area) + return NULL; + last_iova = iopt_area_last_iova(iter->area); + if (iter->last_iova <= last_iova) + return NULL; + + iter->cur_iova = last_iova + 1; + iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, + iter->last_iova); + if (!iter->area) + return NULL; + if (iter->cur_iova != iopt_area_iova(iter->area) || + !iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_used || span->last_hole - span->start_hole < length - 1) + return false; + + span->start_hole = ALIGN(span->start_hole, iova_alignment) | + page_offset; + if (span->start_hole > span->last_hole || + span->last_hole - span->start_hole < length - 1) + return false; + return true; +} + +static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_hole || span->last_used - span->start_used < length - 1) + return false; + + span->start_used = ALIGN(span->start_used, iova_alignment) | + page_offset; + if (span->start_used > span->last_used || + span->last_used - span->start_used < length - 1) + return false; + return true; +} + +/* + * Automatically find a block of IOVA that is not being used and not reserved. + * Does not return a 0 IOVA even if it is valid. + */ +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, + unsigned long uptr, unsigned long length) +{ + unsigned long page_offset = uptr % PAGE_SIZE; + struct interval_tree_double_span_iter used_span; + struct interval_tree_span_iter allowed_span; + unsigned long iova_alignment; + + lockdep_assert_held(&iopt->iova_rwsem); + + /* Protect roundup_pow-of_two() from overflow */ + if (length == 0 || length >= ULONG_MAX / 2) + return -EOVERFLOW; + + /* + * Keep alignment present in the uptr when building the IOVA, this + * increases the chance we can map a THP. + */ + if (!uptr) + iova_alignment = roundup_pow_of_two(length); + else + iova_alignment = min_t(unsigned long, + roundup_pow_of_two(length), + 1UL << __ffs64(uptr)); + + if (iova_alignment < iopt->iova_alignment) + return -EINVAL; + + interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, + PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { + if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { + allowed_span.start_used = PAGE_SIZE; + allowed_span.last_used = ULONG_MAX - PAGE_SIZE; + allowed_span.is_hole = false; + } + + if (!__alloc_iova_check_used(&allowed_span, length, + iova_alignment, page_offset)) + continue; + + interval_tree_for_each_double_span( + &used_span, &iopt->reserved_itree, &iopt->area_itree, + allowed_span.start_used, allowed_span.last_used) { + if (!__alloc_iova_check_hole(&used_span, length, + iova_alignment, + page_offset)) + continue; + + *iova = used_span.start_hole; + return 0; + } + } + return -ENOSPC; +} + +static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + unsigned long last; + + lockdep_assert_held(&iopt->iova_rwsem); + + if ((iova & (iopt->iova_alignment - 1))) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &last)) + return -EOVERFLOW; + + /* No reserved IOVA intersects the range */ + if (iopt_reserved_iter_first(iopt, iova, last)) + return -EINVAL; + + /* Check that there is not already a mapping in the range */ + if (iopt_area_iter_first(iopt, iova, last)) + return -EEXIST; + return 0; +} + +/* + * The area takes a slice of the pages from start_bytes to start_byte + length + */ +static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, + struct iopt_pages *pages, unsigned long iova, + unsigned long start_byte, unsigned long length, + int iommu_prot) +{ + lockdep_assert_held_write(&iopt->iova_rwsem); + + if ((iommu_prot & IOMMU_WRITE) && !pages->writable) + return -EPERM; + + area->iommu_prot = iommu_prot; + area->page_offset = start_byte % PAGE_SIZE; + if (area->page_offset & (iopt->iova_alignment - 1)) + return -EINVAL; + + area->node.start = iova; + if (check_add_overflow(iova, length - 1, &area->node.last)) + return -EOVERFLOW; + + area->pages_node.start = start_byte / PAGE_SIZE; + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) + return -EOVERFLOW; + area->pages_node.last = area->pages_node.last / PAGE_SIZE; + if (WARN_ON(area->pages_node.last >= pages->npages)) + return -EOVERFLOW; + + /* + * The area is inserted with a NULL pages indicating it is not fully + * initialized yet. + */ + area->iopt = iopt; + interval_tree_insert(&area->node, &iopt->area_itree); + return 0; +} + +static int iopt_alloc_area_pages(struct io_pagetable *iopt, + struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + unsigned long iova; + int rc = 0; + + list_for_each_entry(elm, pages_list, next) { + elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); + if (!elm->area) + return -ENOMEM; + } + + down_write(&iopt->iova_rwsem); + if ((length & (iopt->iova_alignment - 1)) || !length) { + rc = -EINVAL; + goto out_unlock; + } + + if (flags & IOPT_ALLOC_IOVA) { + /* Use the first entry to guess the ideal IOVA alignment */ + elm = list_first_entry(pages_list, struct iopt_pages_list, + next); + rc = iopt_alloc_iova( + iopt, dst_iova, + (uintptr_t)elm->pages->uptr + elm->start_byte, length); + if (rc) + goto out_unlock; + } else { + rc = iopt_check_iova(iopt, *dst_iova, length); + if (rc) + goto out_unlock; + } + + /* + * Areas are created with a NULL pages so that the IOVA space is + * reserved and we can unlock the iova_rwsem. + */ + iova = *dst_iova; + list_for_each_entry(elm, pages_list, next) { + rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, + elm->start_byte, elm->length, iommu_prot); + if (rc) + goto out_unlock; + iova += elm->length; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} + +static void iopt_abort_area(struct iopt_area *area) +{ + if (area->iopt) { + down_write(&area->iopt->iova_rwsem); + interval_tree_remove(&area->node, &area->iopt->area_itree); + up_write(&area->iopt->iova_rwsem); + } + kfree(area); +} + +void iopt_free_pages_list(struct list_head *pages_list) +{ + struct iopt_pages_list *elm; + + while ((elm = list_first_entry_or_null(pages_list, + struct iopt_pages_list, next))) { + if (elm->area) + iopt_abort_area(elm->area); + if (elm->pages) + iopt_put_pages(elm->pages); + list_del(&elm->next); + kfree(elm); + } +} + +static int iopt_fill_domains_pages(struct list_head *pages_list) +{ + struct iopt_pages_list *undo_elm; + struct iopt_pages_list *elm; + int rc; + + list_for_each_entry(elm, pages_list, next) { + rc = iopt_area_fill_domains(elm->area, elm->pages); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + list_for_each_entry(undo_elm, pages_list, next) { + if (undo_elm == elm) + break; + iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); + } + return rc; +} + +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + int rc; + + rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, + iommu_prot, flags); + if (rc) + return rc; + + down_read(&iopt->domains_rwsem); + rc = iopt_fill_domains_pages(pages_list); + if (rc) + goto out_unlock_domains; + + down_write(&iopt->iova_rwsem); + list_for_each_entry(elm, pages_list, next) { + /* + * area->pages must be set inside the domains_rwsem to ensure + * any newly added domains will get filled. Moves the reference + * in from the list. + */ + elm->area->pages = elm->pages; + elm->pages = NULL; + elm->area = NULL; + } + up_write(&iopt->iova_rwsem); +out_unlock_domains: + up_read(&iopt->domains_rwsem); + return rc; +} + +/** + * iopt_map_user_pages() - Map a user VA to an iova in the io page table + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @uptr: User VA to map + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + * + * iova, uptr, and length must be aligned to iova_alignment. For domain backed + * page tables this will pin the pages and load them into the domain at iova. + * For non-domain page tables this will only setup a lazy reference and the + * caller must use iopt_access_pages() to touch them. + * + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be + * destroyed. + */ +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(elm.pages)) + return PTR_ERR(elm.pages); + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.start_byte = uptr - elm.pages->uptr; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list) +{ + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + struct iopt_pages_list *elm; + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); + if (!elm) { + rc = -ENOMEM; + goto err_free; + } + elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); + elm->pages = area->pages; + elm->length = (last - iter.cur_iova) + 1; + kref_get(&elm->pages->kref); + list_add_tail(&elm->next, pages_list); + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_free; + } + up_read(&iopt->iova_rwsem); + return 0; +err_free: + up_read(&iopt->iova_rwsem); + iopt_free_pages_list(pages_list); + return rc; +} + +static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, + unsigned long last, unsigned long *unmapped) +{ + struct iopt_area *area; + unsigned long unmapped_bytes = 0; + int rc = -ENOENT; + + /* + * The domains_rwsem must be held in read mode any time any area->pages + * is NULL. This prevents domain attach/detatch from running + * concurrently with cleaning up the area. + */ + down_read(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + while ((area = iopt_area_iter_first(iopt, start, last))) { + unsigned long area_last = iopt_area_last_iova(area); + unsigned long area_first = iopt_area_iova(area); + struct iopt_pages *pages; + + /* Userspace should not race map/unmap's of the same area */ + if (!area->pages) { + rc = -EBUSY; + goto out_unlock_iova; + } + + if (area_first < start || area_last > last) { + rc = -ENOENT; + goto out_unlock_iova; + } + + /* + * num_accesses writers must hold the iova_rwsem too, so we can + * safely read it under the write side of the iovam_rwsem + * without the pages->mutex. + */ + if (area->num_accesses) { + start = area_first; + area->prevent_access = true; + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + /* Later patch calls back to drivers to unmap */ + return -EBUSY; + } + + pages = area->pages; + area->pages = NULL; + up_write(&iopt->iova_rwsem); + + iopt_area_unfill_domains(area, pages); + iopt_abort_area(area); + iopt_put_pages(pages); + + unmapped_bytes += area_last - area_first + 1; + + down_write(&iopt->iova_rwsem); + } + if (unmapped_bytes) + rc = 0; + +out_unlock_iova: + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + if (unmapped) + *unmapped = unmapped_bytes; + return rc; +} + +/** + * iopt_unmap_iova() - Remove a range of iova + * @iopt: io_pagetable to act on + * @iova: Starting iova to unmap + * @length: Number of bytes to unmap + * @unmapped: Return number of bytes unmapped + * + * The requested range must be a superset of existing ranges. + * Splitting/truncating IOVA mappings is not allowed. + */ +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped) +{ + unsigned long iova_last; + + if (!length) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &iova_last)) + return -EOVERFLOW; + + return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); +} + +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) +{ + int rc; + + rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); + /* If the IOVAs are empty then unmap all succeeds */ + if (rc == -ENOENT) + return 0; + return rc; +} + +/* The caller must always free all the nodes in the allowed_iova rb_root. */ +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova) +{ + struct iopt_allowed *allowed; + + down_write(&iopt->iova_rwsem); + swap(*allowed_iova, iopt->allowed_itree); + + for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; + allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { + if (iopt_reserved_iter_first(iopt, allowed->node.start, + allowed->node.last)) { + swap(*allowed_iova, iopt->allowed_itree); + up_write(&iopt->iova_rwsem); + return -EADDRINUSE; + } + } + up_write(&iopt->iova_rwsem); + return 0; +} + +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner) +{ + struct iopt_reserved *reserved; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iopt_area_iter_first(iopt, start, last) || + iopt_allowed_iter_first(iopt, start, last)) + return -EADDRINUSE; + + reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); + if (!reserved) + return -ENOMEM; + reserved->node.start = start; + reserved->node.last = last; + reserved->owner = owner; + interval_tree_insert(&reserved->node, &iopt->reserved_itree); + return 0; +} + +static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + struct iopt_reserved *reserved, *next; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; + reserved = next) { + next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); + + if (reserved->owner == owner) { + interval_tree_remove(&reserved->node, + &iopt->reserved_itree); + kfree(reserved); + } + } +} + +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + down_write(&iopt->iova_rwsem); + __iopt_remove_reserved_iova(iopt, owner); + up_write(&iopt->iova_rwsem); +} + +void iopt_init_table(struct io_pagetable *iopt) +{ + init_rwsem(&iopt->iova_rwsem); + init_rwsem(&iopt->domains_rwsem); + iopt->area_itree = RB_ROOT_CACHED; + iopt->allowed_itree = RB_ROOT_CACHED; + iopt->reserved_itree = RB_ROOT_CACHED; + xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); + xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); + + /* + * iopt's start as SW tables that can use the entire size_t IOVA space + * due to the use of size_t in the APIs. They have no alignment + * restriction. + */ + iopt->iova_alignment = 1; +} + +void iopt_destroy_table(struct io_pagetable *iopt) +{ + struct interval_tree_node *node; + + while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, + ULONG_MAX))) { + interval_tree_remove(node, &iopt->allowed_itree); + kfree(container_of(node, struct iopt_allowed, node)); + } + + WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); + WARN_ON(!xa_empty(&iopt->domains)); + WARN_ON(!xa_empty(&iopt->access_list)); + WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); +} + +/** + * iopt_unfill_domain() - Unfill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to unfill + * + * This is used when removing a domain from the iopt. Every area in the iopt + * will be unmapped from the domain. The domain must already be removed from the + * domains xarray. + */ +static void iopt_unfill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + /* + * Some other domain is holding all the pfns still, rapidly unmap this + * domain. + */ + if (iopt->next_domain_id != 0) { + /* Pick an arbitrary remaining domain to act as storage */ + struct iommu_domain *storage_domain = + xa_load(&iopt->domains, 0); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + if (area->storage_domain == domain) + area->storage_domain = storage_domain; + mutex_unlock(&pages->mutex); + + iopt_area_unmap_domain(area, domain); + } + return; + } + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + interval_tree_remove(&area->pages_node, &pages->domains_itree); + WARN_ON(area->storage_domain != domain); + area->storage_domain = NULL; + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } +} + +/** + * iopt_fill_domain() - Fill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to fill + * + * Fill the domain with PFNs from every area in the iopt. On failure the domain + * is left unchanged. + */ +static int iopt_fill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *end_area; + struct iopt_area *area; + int rc; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + rc = iopt_area_fill_domain(area, domain); + if (rc) { + mutex_unlock(&pages->mutex); + goto out_unfill; + } + if (!area->storage_domain) { + WARN_ON(iopt->next_domain_id != 0); + area->storage_domain = domain; + interval_tree_insert(&area->pages_node, + &pages->domains_itree); + } + mutex_unlock(&pages->mutex); + } + return 0; + +out_unfill: + end_area = area; + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (area == end_area) + break; + if (!pages) + continue; + mutex_lock(&pages->mutex); + if (iopt->next_domain_id == 0) { + interval_tree_remove(&area->pages_node, + &pages->domains_itree); + area->storage_domain = NULL; + } + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } + return rc; +} + +/* All existing area's conform to an increased page size */ +static int iopt_check_iova_alignment(struct io_pagetable *iopt, + unsigned long new_iova_alignment) +{ + unsigned long align_mask = new_iova_alignment - 1; + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) + if ((iopt_area_iova(area) & align_mask) || + (iopt_area_length(area) & align_mask) || + (area->page_offset & align_mask)) + return -EADDRINUSE; + return 0; +} + +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_domain_geometry *geometry = &domain->geometry; + struct iommu_domain *iter_domain; + unsigned int new_iova_alignment; + unsigned long index; + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) { + if (WARN_ON(iter_domain == domain)) { + rc = -EEXIST; + goto out_unlock; + } + } + + /* + * The io page size drives the iova_alignment. Internally the iopt_pages + * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE + * objects into the iommu_domain. + * + * A iommu_domain must always be able to accept PAGE_SIZE to be + * compatible as we can't guarantee higher contiguity. + */ + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + iopt->iova_alignment); + if (new_iova_alignment > PAGE_SIZE) { + rc = -EINVAL; + goto out_unlock; + } + if (new_iova_alignment != iopt->iova_alignment) { + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + goto out_unlock; + } + + /* No area exists that is outside the allowed domain aperture */ + if (geometry->aperture_start != 0) { + rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, + domain); + if (rc) + goto out_reserved; + } + if (geometry->aperture_end != ULONG_MAX) { + rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, + ULONG_MAX, domain); + if (rc) + goto out_reserved; + } + + rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); + if (rc) + goto out_reserved; + + rc = iopt_fill_domain(iopt, domain); + if (rc) + goto out_release; + + iopt->iova_alignment = new_iova_alignment; + xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); + iopt->next_domain_id++; + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return 0; +out_release: + xa_release(&iopt->domains, iopt->next_domain_id); +out_reserved: + __iopt_remove_reserved_iova(iopt, domain); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) +{ + unsigned long new_iova_alignment; + struct iommufd_access *access; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held_write(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + /* See batch_iommu_map_small() */ + if (iopt->disable_large_pages) + new_iova_alignment = PAGE_SIZE; + else + new_iova_alignment = 1; + + xa_for_each(&iopt->domains, index, domain) + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + new_iova_alignment); + xa_for_each(&iopt->access_list, index, access) + new_iova_alignment = max_t(unsigned long, + access->iova_alignment, + new_iova_alignment); + + if (new_iova_alignment > iopt->iova_alignment) { + int rc; + + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + return rc; + } + iopt->iova_alignment = new_iova_alignment; + return 0; +} + +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iommu_domain *iter_domain = NULL; + unsigned long index; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) + if (iter_domain == domain) + break; + if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) + goto out_unlock; + + /* + * Compress the xarray to keep it linear by swapping the entry to erase + * with the tail entry and shrinking the tail. + */ + iopt->next_domain_id--; + iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); + if (index != iopt->next_domain_id) + xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); + + iopt_unfill_domain(iopt, domain); + __iopt_remove_reserved_iova(iopt, domain); + + WARN_ON(iopt_calculate_iova_alignment(iopt)); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/** + * iopt_area_split - Split an area into two parts at iova + * @area: The area to split + * @iova: Becomes the last of a new area + * + * This splits an area into two. It is part of the VFIO compatibility to allow + * poking a hole in the mapping. The two areas continue to point at the same + * iopt_pages, just with different starting bytes. + */ +static int iopt_area_split(struct iopt_area *area, unsigned long iova) +{ + unsigned long alignment = area->iopt->iova_alignment; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned long start_iova = iopt_area_iova(area); + unsigned long new_start = iova + 1; + struct io_pagetable *iopt = area->iopt; + struct iopt_pages *pages = area->pages; + struct iopt_area *lhs; + struct iopt_area *rhs; + int rc; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iova == start_iova || iova == last_iova) + return 0; + + if (!pages || area->prevent_access) + return -EBUSY; + + if (new_start & (alignment - 1) || + iopt_area_start_byte(area, new_start) & (alignment - 1)) + return -EINVAL; + + lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!lhs) + return -ENOMEM; + + rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!rhs) { + rc = -ENOMEM; + goto err_free_lhs; + } + + mutex_lock(&pages->mutex); + /* + * Splitting is not permitted if an access exists, we don't track enough + * information to split existing accesses. + */ + if (area->num_accesses) { + rc = -EINVAL; + goto err_unlock; + } + + /* + * Splitting is not permitted if a domain could have been mapped with + * huge pages. + */ + if (area->storage_domain && !iopt->disable_large_pages) { + rc = -EINVAL; + goto err_unlock; + } + + interval_tree_remove(&area->node, &iopt->area_itree); + rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, + iopt_area_start_byte(area, start_iova), + (new_start - 1) - start_iova + 1, + area->iommu_prot); + if (WARN_ON(rc)) + goto err_insert; + + rc = iopt_insert_area(iopt, rhs, area->pages, new_start, + iopt_area_start_byte(area, new_start), + last_iova - new_start + 1, area->iommu_prot); + if (WARN_ON(rc)) + goto err_remove_lhs; + + lhs->storage_domain = area->storage_domain; + lhs->pages = area->pages; + rhs->storage_domain = area->storage_domain; + rhs->pages = area->pages; + kref_get(&rhs->pages->kref); + kfree(area); + mutex_unlock(&pages->mutex); + + /* + * No change to domains or accesses because the pages hasn't been + * changed + */ + return 0; + +err_remove_lhs: + interval_tree_remove(&lhs->node, &iopt->area_itree); +err_insert: + interval_tree_insert(&area->node, &iopt->area_itree); +err_unlock: + mutex_unlock(&pages->mutex); + kfree(rhs); +err_free_lhs: + kfree(lhs); + return rc; +} + +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas) +{ + int rc = 0; + int i; + + down_write(&iopt->iova_rwsem); + for (i = 0; i < num_iovas; i++) { + struct iopt_area *area; + + area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); + if (!area) + continue; + rc = iopt_area_split(area, iovas[i]); + if (rc) + break; + } + up_write(&iopt->iova_rwsem); + return rc; +} + +void iopt_enable_large_pages(struct io_pagetable *iopt) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WRITE_ONCE(iopt->disable_large_pages, false); + rc = iopt_calculate_iova_alignment(iopt); + WARN_ON(rc); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +int iopt_disable_large_pages(struct io_pagetable *iopt) +{ + int rc = 0; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + if (iopt->disable_large_pages) + goto out_unlock; + + /* Won't do it if domains already have pages mapped in them */ + if (!xa_empty(&iopt->domains) && + !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { + rc = -EINVAL; + goto out_unlock; + } + + WRITE_ONCE(iopt->disable_large_pages, true); + rc = iopt_calculate_iova_alignment(iopt); + if (rc) + WRITE_ONCE(iopt->disable_large_pages, false); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access, + xa_limit_16b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_unlock; + + rc = iopt_calculate_iova_alignment(iopt); + if (rc) { + xa_erase(&iopt->access_list, access->iopt_access_list_id); + goto out_unlock; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access) +{ + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) != + access); + WARN_ON(iopt_calculate_iova_alignment(iopt)); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/* Narrow the valid_iova_itree to include reserved ranges from a group. */ +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start) +{ + struct iommu_resv_region *resv; + struct iommu_resv_region *tmp; + LIST_HEAD(group_resv_regions); + int rc; + + down_write(&iopt->iova_rwsem); + rc = iommu_get_group_resv_regions(group, &group_resv_regions); + if (rc) + goto out_unlock; + + list_for_each_entry(resv, &group_resv_regions, list) { + if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) + continue; + + /* + * The presence of any 'real' MSI regions should take precedence + * over the software-managed one if the IOMMU driver happens to + * advertise both types. + */ + if (sw_msi_start && resv->type == IOMMU_RESV_MSI) { + *sw_msi_start = 0; + sw_msi_start = NULL; + } + if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) + *sw_msi_start = resv->start; + + rc = iopt_reserve_iova(iopt, resv->start, + resv->length - 1 + resv->start, device); + if (rc) + goto out_reserved; + } + rc = 0; + goto out_free_resv; + +out_reserved: + __iopt_remove_reserved_iova(iopt, device); +out_free_resv: + list_for_each_entry_safe(resv, tmp, &group_resv_regions, list) + kfree(resv); +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index a2b724175057b..2ee6942c3ef4a 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -46,9 +46,19 @@ struct iopt_area { unsigned int page_offset; /* IOMMU_READ, IOMMU_WRITE, etc */ int iommu_prot; + bool prevent_access : 1; unsigned int num_accesses; }; +struct iopt_allowed { + struct interval_tree_node node; +}; + +struct iopt_reserved { + struct interval_tree_node node; + void *owner; +}; + int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); @@ -83,6 +93,24 @@ static inline size_t iopt_area_length(struct iopt_area *area) return (area->node.last - area->node.start) + 1; } +/* + * Number of bytes from the start of the iopt_pages that the iova begins. + * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index + * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page + */ +static inline unsigned long iopt_area_start_byte(struct iopt_area *area, + unsigned long iova) +{ + return (iova - iopt_area_iova(area)) + area->page_offset + + iopt_area_index(area) * PAGE_SIZE; +} + +static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area, + unsigned long iova) +{ + return iopt_area_start_byte(area, iova) / PAGE_SIZE; +} + #define __make_iopt_iter(name) \ static inline struct iopt_##name *iopt_##name##_iter_first( \ struct io_pagetable *iopt, unsigned long start, \ @@ -110,6 +138,33 @@ static inline size_t iopt_area_length(struct iopt_area *area) } __make_iopt_iter(area) +__make_iopt_iter(allowed) +__make_iopt_iter(reserved) + +struct iopt_area_contig_iter { + unsigned long cur_iova; + unsigned long last_iova; + struct iopt_area *area; +}; +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova); +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter); + +static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter) +{ + return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area); +} + +/* + * Iterate over a contiguous list of areas that span the iova,last_iova range. + * The caller must check iopt_area_contig_done() after the loop to see if + * contiguous areas existed. + */ +#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \ + for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \ + area = iopt_area_contig_next(iter)) enum { IOPT_PAGES_ACCOUNT_NONE = 0, diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 169a30ff3bf0d..f7ab6c6edafd1 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -9,9 +9,14 @@ #include #include +struct iommu_domain; +struct iommu_group; + struct iommufd_ctx { struct file *file; struct xarray objects; + + u8 account_mode; }; /* @@ -27,6 +32,7 @@ struct iommufd_ctx { struct io_pagetable { struct rw_semaphore domains_rwsem; struct xarray domains; + struct xarray access_list; unsigned int next_domain_id; struct rw_semaphore iova_rwsem; @@ -36,7 +42,45 @@ struct io_pagetable { /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; + unsigned long iova_alignment; +}; + +void iopt_init_table(struct io_pagetable *iopt); +void iopt_destroy_table(struct io_pagetable *iopt); +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list); +void iopt_free_pages_list(struct list_head *pages_list); +enum { + IOPT_ALLOC_IOVA = 1 << 0, }; +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags); +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags); +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped); +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); + +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start); +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova); +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner); +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas); +void iopt_enable_large_pages(struct io_pagetable *iopt); +int iopt_disable_large_pages(struct io_pagetable *iopt); struct iommufd_ucmd { struct iommufd_ctx *ictx; @@ -130,4 +174,12 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +struct iommufd_access { + unsigned long iova_alignment; + u32 iopt_access_list_id; +}; + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access); #endif -- GitLab From aad37e71d5c4dc1d3c25734f0bcd51c324f94b5e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:34 -0400 Subject: [PATCH 1010/1423] iommufd: IOCTLs for the io_pagetable Connect the IOAS to its IOCTL interface. This exposes most of the functionality in the io_pagetable to userspace. This is intended to be the core of the generic interface that IOMMUFD will provide. Every IOMMU driver should be able to implement an iommu_domain that is compatible with this generic mechanism. It is also designed to be easy to use for simple non virtual machine monitor users, like DPDK: - Universal simple support for all IOMMUs (no PPC special path) - An IOVA allocator that considers the aperture and the allowed/reserved ranges - io_pagetable allows any number of iommu_domains to be connected to the IOAS - Automatic allocation and re-use of iommu_domains Along with room in the design to add non-generic features to cater to specific HW functionality. Link: https://lore.kernel.org/r/11-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/ioas.c | 392 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 33 ++ drivers/iommu/iommufd/main.c | 48 +++ include/uapi/linux/iommufd.h | 258 +++++++++++++++- 5 files changed, 731 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/ioas.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index b66a8c47ff55e..2b4f36f1b72f9 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ io_pagetable.o \ + ioas.o \ main.o \ pages.o diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c new file mode 100644 index 0000000000000..6ff97dafc8913 --- /dev/null +++ b/drivers/iommu/iommufd/ioas.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include +#include +#include +#include + +#include "io_pagetable.h" + +void iommufd_ioas_destroy(struct iommufd_object *obj) +{ + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); + int rc; + + rc = iopt_unmap_all(&ioas->iopt, NULL); + WARN_ON(rc && rc != -ENOENT); + iopt_destroy_table(&ioas->iopt); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas; + + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); + if (IS_ERR(ioas)) + return ioas; + + iopt_init_table(&ioas->iopt); + return ioas; +} + +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_alloc *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + ioas = iommufd_ioas_alloc(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + cmd->out_ioas_id = ioas->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_table; + iommufd_object_finalize(ucmd->ictx, &ioas->obj); + return 0; + +out_table: + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); + return rc; +} + +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) +{ + struct iommu_iova_range __user *ranges; + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + struct interval_tree_span_iter span; + u32 max_iovas; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + down_read(&ioas->iopt.iova_rwsem); + max_iovas = cmd->num_iovas; + ranges = u64_to_user_ptr(cmd->allowed_iovas); + cmd->num_iovas = 0; + cmd->out_iova_alignment = ioas->iopt.iova_alignment; + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + if (!span.is_hole) + continue; + if (cmd->num_iovas < max_iovas) { + struct iommu_iova_range elm = { + .start = span.start_hole, + .last = span.last_hole, + }; + + if (copy_to_user(&ranges[cmd->num_iovas], &elm, + sizeof(elm))) { + rc = -EFAULT; + goto out_put; + } + } + cmd->num_iovas++; + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put; + if (cmd->num_iovas > max_iovas) + rc = -EMSGSIZE; +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, + struct iommu_iova_range __user *ranges, + u32 num) +{ + u32 i; + + for (i = 0; i != num; i++) { + struct iommu_iova_range range; + struct iopt_allowed *allowed; + + if (copy_from_user(&range, ranges + i, sizeof(range))) + return -EFAULT; + + if (range.start >= range.last) + return -EINVAL; + + if (interval_tree_iter_first(itree, range.start, range.last)) + return -EINVAL; + + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); + if (!allowed) + return -ENOMEM; + allowed->node.start = range.start; + allowed->node.last = range.last; + + interval_tree_insert(&allowed->node, itree); + } + return 0; +} + +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; + struct interval_tree_node *node; + struct iommufd_ioas *ioas; + struct io_pagetable *iopt; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + iopt = &ioas->iopt; + + rc = iommufd_ioas_load_iovas(&allowed_iova, + u64_to_user_ptr(cmd->allowed_iovas), + cmd->num_iovas); + if (rc) + goto out_free; + + /* + * We want the allowed tree update to be atomic, so we have to keep the + * original nodes around, and keep track of the new nodes as we allocate + * memory for them. The simplest solution is to have a new/old tree and + * then swap new for old. On success we free the old tree, on failure we + * free the new tree. + */ + rc = iopt_set_allow_iova(iopt, &allowed_iova); +out_free: + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { + interval_tree_remove(node, &allowed_iova); + kfree(container_of(node, struct iopt_allowed, node)); + } + iommufd_put_object(&ioas->obj); + return rc; +} + +static int conv_iommu_prot(u32 map_flags) +{ + /* + * We provide no manual cache coherency ioctls to userspace and most + * architectures make the CPU ops for cache flushing privileged. + * Therefore we require the underlying IOMMU to support CPU coherent + * operation. Support for IOMMU_CACHE is enforced by the + * IOMMU_CAP_CACHE_COHERENCY test during bind. + */ + int iommu_prot = IOMMU_CACHE; + + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) + iommu_prot |= IOMMU_WRITE; + if (map_flags & IOMMU_IOAS_MAP_READABLE) + iommu_prot |= IOMMU_READ; + return iommu_prot; +} + +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) || + cmd->__reserved) + return -EOPNOTSUPP; + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, + u64_to_user_ptr(cmd->user_va), cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_copy *cmd = ucmd->cmd; + struct iommufd_ioas *src_ioas; + struct iommufd_ioas *dst_ioas; + unsigned int flags = 0; + LIST_HEAD(pages_list); + unsigned long iova; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE))) + return -EOPNOTSUPP; + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || + cmd->dst_iova >= ULONG_MAX) + return -EOVERFLOW; + + src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id); + if (IS_ERR(src_ioas)) + return PTR_ERR(src_ioas); + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, + &pages_list); + iommufd_put_object(&src_ioas->obj); + if (rc) + return rc; + + dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id); + if (IS_ERR(dst_ioas)) { + rc = PTR_ERR(dst_ioas); + goto out_pages; + } + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->dst_iova; + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put_dst; + + cmd->dst_iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put_dst: + iommufd_put_object(&dst_ioas->obj); +out_pages: + iopt_free_pages_list(&pages_list); + return rc; +} + +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_unmap *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned long unmapped = 0; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (cmd->iova == 0 && cmd->length == U64_MAX) { + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + if (rc) + goto out_put; + } else { + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { + rc = -EOVERFLOW; + goto out_put; + } + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, + &unmapped); + if (rc) + goto out_put; + } + + cmd->length = unmapped; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx) +{ + if (cmd->object_id) + return -EOPNOTSUPP; + + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + int rc = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + xa_lock(&ictx->objects); + if (!xa_empty(&ictx->objects)) { + rc = -EBUSY; + } else { + if (cmd->val64 == 0) + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; + else if (cmd->val64 == 1) + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + else + rc = -EINVAL; + } + xa_unlock(&ictx->objects); + + return rc; + } + return -EOPNOTSUPP; +} + +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, + struct iommufd_ioas *ioas) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = !ioas->iopt.disable_large_pages; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + if (cmd->val64 == 0) + return iopt_disable_large_pages(&ioas->iopt); + if (cmd->val64 == 1) { + iopt_enable_large_pages(&ioas->iopt); + return 0; + } + return -EINVAL; + } + return -EOPNOTSUPP; +} + +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->object_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + switch (cmd->option_id) { + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option_huge_pages(cmd, ioas); + break; + default: + rc = -EOPNOTSUPP; + } + + iommufd_put_object(&ioas->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f7ab6c6edafd1..1a13c54a8def8 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -11,6 +11,7 @@ struct iommu_domain; struct iommu_group; +struct iommu_option; struct iommufd_ctx { struct file *file; @@ -102,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_IOAS, }; /* Base struct for all objects with a userspace ID handle. */ @@ -174,6 +176,37 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +/* + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The + * mapping is copied into all of the associated domains and made available to + * in-kernel users. + */ +struct iommufd_ioas { + struct iommufd_object obj; + struct io_pagetable iopt; +}; + +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, + u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_IOAS), + struct iommufd_ioas, obj); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_ioas_destroy(struct iommufd_object *obj); +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx); + struct iommufd_access { unsigned long iova_alignment; u32 iopt_access_list_id; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index dfbc68b97506d..1c0a1f499378d 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -204,8 +204,39 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) return 0; } +static int iommufd_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + switch (cmd->option_id) { + case IOMMU_OPTION_RLIMIT_MODE: + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); + break; + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option(ucmd); + break; + default: + return -EOPNOTSUPP; + } + if (rc) + return rc; + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, + &cmd->val64, sizeof(cmd->val64))) + return -EFAULT; + return 0; +} + union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_ioas_alloc alloc; + struct iommu_ioas_allow_iovas allow_iovas; + struct iommu_ioas_iova_ranges iova_ranges; + struct iommu_ioas_map map; + struct iommu_ioas_unmap unmap; }; struct iommufd_ioctl_op { @@ -226,6 +257,20 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, + struct iommu_ioas_alloc, out_ioas_id), + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, + struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, + src_iova), + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, + struct iommu_ioas_iova_ranges, out_iova_alignment), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, + iova), + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, + length), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, + val64), }; static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, @@ -312,6 +357,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, }; static struct miscdevice iommu_misc_dev = { diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 37de92f0534b8..30cc5c5e2b347 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -37,12 +37,19 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, + IOMMUFD_CMD_IOAS_ALLOC, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS, + IOMMUFD_CMD_IOAS_COPY, + IOMMUFD_CMD_IOAS_IOVA_RANGES, + IOMMUFD_CMD_IOAS_MAP, + IOMMUFD_CMD_IOAS_UNMAP, + IOMMUFD_CMD_OPTION, }; /** * struct iommu_destroy - ioctl(IOMMU_DESTROY) * @size: sizeof(struct iommu_destroy) - * @id: iommufd object ID to destroy. Can by any destroyable object type. + * @id: iommufd object ID to destroy. Can be any destroyable object type. * * Destroy any object held within iommufd. */ @@ -52,4 +59,253 @@ struct iommu_destroy { }; #define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY) +/** + * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC) + * @size: sizeof(struct iommu_ioas_alloc) + * @flags: Must be 0 + * @out_ioas_id: Output IOAS ID for the allocated object + * + * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA) + * to memory mapping. + */ +struct iommu_ioas_alloc { + __u32 size; + __u32 flags; + __u32 out_ioas_id; +}; +#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC) + +/** + * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE) + * @start: First IOVA + * @last: Inclusive last IOVA + * + * An interval in IOVA space. + */ +struct iommu_iova_range { + __aligned_u64 start; + __aligned_u64 last; +}; + +/** + * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES) + * @size: sizeof(struct iommu_ioas_iova_ranges) + * @ioas_id: IOAS ID to read ranges from + * @num_iovas: Input/Output total number of ranges in the IOAS + * @__reserved: Must be 0 + * @allowed_iovas: Pointer to the output array of struct iommu_iova_range + * @out_iova_alignment: Minimum alignment required for mapping IOVA + * + * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges + * is not allowed. num_iovas will be set to the total number of iovas and + * the allowed_iovas[] will be filled in as space permits. + * + * The allowed ranges are dependent on the HW path the DMA operation takes, and + * can change during the lifetime of the IOAS. A fresh empty IOAS will have a + * full range, and each attached device will narrow the ranges based on that + * device's HW restrictions. Detaching a device can widen the ranges. Userspace + * should query ranges after every attach/detach to know what IOVAs are valid + * for mapping. + * + * On input num_iovas is the length of the allowed_iovas array. On output it is + * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set + * num_iovas to the required value if num_iovas is too small. In this case the + * caller should allocate a larger output array and re-issue the ioctl. + * + * out_iova_alignment returns the minimum IOVA alignment that can be given + * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy:: + * + * starting_iova % out_iova_alignment == 0 + * (starting_iova + length) % out_iova_alignment == 0 + * + * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot + * be higher than the system PAGE_SIZE. + */ +struct iommu_ioas_iova_ranges { + __u32 size; + __u32 ioas_id; + __u32 num_iovas; + __u32 __reserved; + __aligned_u64 allowed_iovas; + __aligned_u64 out_iova_alignment; +}; +#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES) + +/** + * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS) + * @size: sizeof(struct iommu_ioas_allow_iovas) + * @ioas_id: IOAS ID to allow IOVAs from + * @num_iovas: Input/Output total number of ranges in the IOAS + * @__reserved: Must be 0 + * @allowed_iovas: Pointer to array of struct iommu_iova_range + * + * Ensure a range of IOVAs are always available for allocation. If this call + * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges + * that are narrower than the ranges provided here. This call will fail if + * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges. + * + * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as + * devices are attached the IOVA will narrow based on the device restrictions. + * When an allowed range is specified any narrowing will be refused, ie device + * attachment can fail if the device requires limiting within the allowed range. + * + * Automatic IOVA allocation is also impacted by this call. MAP will only + * allocate within the allowed IOVAs if they are present. + * + * This call replaces the entire allowed list with the given list. + */ +struct iommu_ioas_allow_iovas { + __u32 size; + __u32 ioas_id; + __u32 num_iovas; + __u32 __reserved; + __aligned_u64 allowed_iovas; +}; +#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS) + +/** + * enum iommufd_ioas_map_flags - Flags for map and copy + * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate + * IOVA to place the mapping at + * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping + * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping + */ +enum iommufd_ioas_map_flags { + IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0, + IOMMU_IOAS_MAP_WRITEABLE = 1 << 1, + IOMMU_IOAS_MAP_READABLE = 1 << 2, +}; + +/** + * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP) + * @size: sizeof(struct iommu_ioas_map) + * @flags: Combination of enum iommufd_ioas_map_flags + * @ioas_id: IOAS ID to change the mapping of + * @__reserved: Must be 0 + * @user_va: Userspace pointer to start mapping from + * @length: Number of bytes to map + * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set + * then this must be provided as input. + * + * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the + * mapping will be established at iova, otherwise a suitable location based on + * the reserved and allowed lists will be automatically selected and returned in + * iova. + * + * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently + * be unused, existing IOVA cannot be replaced. + */ +struct iommu_ioas_map { + __u32 size; + __u32 flags; + __u32 ioas_id; + __u32 __reserved; + __aligned_u64 user_va; + __aligned_u64 length; + __aligned_u64 iova; +}; +#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP) + +/** + * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY) + * @size: sizeof(struct iommu_ioas_copy) + * @flags: Combination of enum iommufd_ioas_map_flags + * @dst_ioas_id: IOAS ID to change the mapping of + * @src_ioas_id: IOAS ID to copy from + * @length: Number of bytes to copy and map + * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is + * set then this must be provided as input. + * @src_iova: IOVA to start the copy + * + * Copy an already existing mapping from src_ioas_id and establish it in + * dst_ioas_id. The src iova/length must exactly match a range used with + * IOMMU_IOAS_MAP. + * + * This may be used to efficiently clone a subset of an IOAS to another, or as a + * kind of 'cache' to speed up mapping. Copy has an efficiency advantage over + * establishing equivalent new mappings, as internal resources are shared, and + * the kernel will pin the user memory only once. + */ +struct iommu_ioas_copy { + __u32 size; + __u32 flags; + __u32 dst_ioas_id; + __u32 src_ioas_id; + __aligned_u64 length; + __aligned_u64 dst_iova; + __aligned_u64 src_iova; +}; +#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY) + +/** + * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP) + * @size: sizeof(struct iommu_ioas_unmap) + * @ioas_id: IOAS ID to change the mapping of + * @iova: IOVA to start the unmapping at + * @length: Number of bytes to unmap, and return back the bytes unmapped + * + * Unmap an IOVA range. The iova/length must be a superset of a previously + * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or + * truncating ranges is not allowed. The values 0 to U64_MAX will unmap + * everything. + */ +struct iommu_ioas_unmap { + __u32 size; + __u32 ioas_id; + __aligned_u64 iova; + __aligned_u64 length; +}; +#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP) + +/** + * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and + * ioctl(IOMMU_OPTION_HUGE_PAGES) + * @IOMMU_OPTION_RLIMIT_MODE: + * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege + * to invoke this. Value 0 (default) is user based accouting, 1 uses process + * based accounting. Global option, object_id must be 0 + * @IOMMU_OPTION_HUGE_PAGES: + * Value 1 (default) allows contiguous pages to be combined when generating + * iommu mappings. Value 0 disables combining, everything is mapped to + * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS + * option, the object_id must be the IOAS ID. + */ +enum iommufd_option { + IOMMU_OPTION_RLIMIT_MODE = 0, + IOMMU_OPTION_HUGE_PAGES = 1, +}; + +/** + * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and + * ioctl(IOMMU_OPTION_OP_GET) + * @IOMMU_OPTION_OP_SET: Set the option's value + * @IOMMU_OPTION_OP_GET: Get the option's value + */ +enum iommufd_option_ops { + IOMMU_OPTION_OP_SET = 0, + IOMMU_OPTION_OP_GET = 1, +}; + +/** + * struct iommu_option - iommu option multiplexer + * @size: sizeof(struct iommu_option) + * @option_id: One of enum iommufd_option + * @op: One of enum iommufd_option_ops + * @__reserved: Must be 0 + * @object_id: ID of the object if required + * @val64: Option value to set or value returned on get + * + * Change a simple option value. This multiplexor allows controlling options + * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET + * will return the current value. + */ +struct iommu_option { + __u32 size; + __u32 option_id; + __u16 op; + __u16 __reserved; + __u32 object_id; + __aligned_u64 val64; +}; +#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) #endif -- GitLab From ea4acfac57b9dee57a7d5840359a41cc3251de92 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:35 -0400 Subject: [PATCH 1011/1423] iommufd: Add a HW pagetable object The hw_pagetable object exposes the internal struct iommu_domain's to userspace. An iommu_domain is required when any DMA device attaches to an IOAS to control the io page table through the iommu driver. For compatibility with VFIO the hw_pagetable is automatically created when a DMA device is attached to the IOAS. If a compatible iommu_domain already exists then the hw_pagetable associated with it is used for the attachment. In the initial series there is no iommufd uAPI for the hw_pagetable object. The next patch provides driver facing APIs for IO page table attachment that allows drivers to accept either an IOAS or a hw_pagetable ID and for the driver to return the hw_pagetable ID that was auto-selected from an IOAS. The expectation is the driver will provide uAPI through its own FD for attaching its device to iommufd. This allows userspace to learn the mapping of devices to iommu_domains and to override the automatic attachment. The future HW specific interface will allow userspace to create hw_pagetable objects using iommu_domains with IOMMU driver specific parameters. This infrastructure will allow linking those domains to IOAS's and devices. Link: https://lore.kernel.org/r/12-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/hw_pagetable.c | 57 +++++++++++++++++++++++++ drivers/iommu/iommufd/ioas.c | 3 ++ drivers/iommu/iommufd/iommufd_private.h | 33 ++++++++++++++ drivers/iommu/iommufd/main.c | 3 ++ 5 files changed, 97 insertions(+) create mode 100644 drivers/iommu/iommufd/hw_pagetable.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 2b4f36f1b72f9..e13e971aa28c6 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ + hw_pagetable.o \ io_pagetable.o \ ioas.o \ main.o \ diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c new file mode 100644 index 0000000000000..43d473989a066 --- /dev/null +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include + +#include "iommufd_private.h" + +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + WARN_ON(!list_empty(&hwpt->devices)); + + iommu_domain_free(hwpt->domain); + refcount_dec(&hwpt->ioas->obj.users); + mutex_destroy(&hwpt->devices_lock); +} + +/** + * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * @ictx: iommufd context + * @ioas: IOAS to associate the domain with + * @dev: Device to get an iommu_domain for + * + * Allocate a new iommu_domain and return it as a hw_pagetable. + */ +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(hwpt)) + return hwpt; + + hwpt->domain = iommu_domain_alloc(dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } + + INIT_LIST_HEAD(&hwpt->devices); + INIT_LIST_HEAD(&hwpt->hwpt_item); + mutex_init(&hwpt->devices_lock); + /* Pairs with iommufd_hw_pagetable_destroy() */ + refcount_inc(&ioas->obj.users); + hwpt->ioas = ioas; + return hwpt; + +out_abort: + iommufd_object_abort(ictx, &hwpt->obj); + return ERR_PTR(rc); +} diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 6ff97dafc8913..302779b33bd4a 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -17,6 +17,7 @@ void iommufd_ioas_destroy(struct iommufd_object *obj) rc = iopt_unmap_all(&ioas->iopt, NULL); WARN_ON(rc && rc != -ENOENT); iopt_destroy_table(&ioas->iopt); + mutex_destroy(&ioas->mutex); } struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) @@ -28,6 +29,8 @@ struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) return ioas; iopt_init_table(&ioas->iopt); + INIT_LIST_HEAD(&ioas->hwpt_list); + mutex_init(&ioas->mutex); return ioas; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 1a13c54a8def8..6b0448702a956 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -103,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_HW_PAGETABLE, IOMMUFD_OBJ_IOAS, }; @@ -181,10 +182,20 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The * mapping is copied into all of the associated domains and made available to * in-kernel users. + * + * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable + * object. When we go to attach a device to an IOAS we need to get an + * iommu_domain and wrapping iommufd_hw_pagetable for it. + * + * An iommu_domain & iommfd_hw_pagetable will be automatically selected + * for a device based on the hwpt_list. If no suitable iommu_domain + * is found a new iommu_domain will be created. */ struct iommufd_ioas { struct iommufd_object obj; struct io_pagetable iopt; + struct mutex mutex; + struct list_head hwpt_list; }; static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, @@ -207,6 +218,28 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); +/* + * A HW pagetable is called an iommu_domain inside the kernel. This user object + * allows directly creating and inspecting the domains. Domains that have kernel + * owned page tables will be associated with an iommufd_ioas that provides the + * IOVA to PFN map. + */ +struct iommufd_hw_pagetable { + struct iommufd_object obj; + struct iommufd_ioas *ioas; + struct iommu_domain *domain; + bool auto_domain : 1; + /* Head at iommufd_ioas::hwpt_list */ + struct list_head hwpt_item; + struct mutex devices_lock; + struct list_head devices; +}; + +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev); +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); + struct iommufd_access { unsigned long iova_alignment; u32 iopt_access_list_id; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 1c0a1f499378d..ac6580a7b7066 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -360,6 +360,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, + [IOMMUFD_OBJ_HW_PAGETABLE] = { + .destroy = iommufd_hw_pagetable_destroy, + }, }; static struct miscdevice iommu_misc_dev = { -- GitLab From e8d57210035b6377d424ba964961892d01127cf6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:36 -0400 Subject: [PATCH 1012/1423] iommufd: Add kAPI toward external drivers for physical devices Add the four functions external drivers need to connect physical DMA to the IOMMUFD: iommufd_device_bind() / iommufd_device_unbind() Register the device with iommufd and establish security isolation. iommufd_device_attach() / iommufd_device_detach() Connect a bound device to a page table Binding a device creates a device object ID in the uAPI, however the generic API does not yet provide any IOCTLs to manipulate them. Link: https://lore.kernel.org/r/13-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/device.c | 419 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 5 + drivers/iommu/iommufd/main.c | 3 + include/linux/iommufd.h | 9 + 5 files changed, 437 insertions(+) create mode 100644 drivers/iommu/iommufd/device.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index e13e971aa28c6..ca28a135b9675 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ + device.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c new file mode 100644 index 0000000000000..67cd00b4d9265 --- /dev/null +++ b/drivers/iommu/iommufd/device.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include +#include +#include +#include + +#include "iommufd_private.h" + +static bool allow_unsafe_interrupts; +module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC( + allow_unsafe_interrupts, + "Allow IOMMUFD to bind to devices even if the platform cannot isolate " + "the MSI interrupt window. Enabling this is a security weakness."); + +/* + * A iommufd_device object represents the binding relationship between a + * consuming driver and the iommufd. These objects are created/destroyed by + * external drivers, not by userspace. + */ +struct iommufd_device { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_hw_pagetable *hwpt; + /* Head at iommufd_hw_pagetable::devices */ + struct list_head devices_item; + /* always the physical device */ + struct device *dev; + struct iommu_group *group; + bool enforce_cache_coherency; +}; + +void iommufd_device_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + iommu_device_release_dma_owner(idev->dev); + iommu_group_put(idev->group); + iommufd_ctx_put(idev->ictx); +} + +/** + * iommufd_device_bind - Bind a physical device to an iommu fd + * @ictx: iommufd file descriptor + * @dev: Pointer to a physical device struct + * @id: Output ID number to return to userspace for this device + * + * A successful bind establishes an ownership over the device and returns + * struct iommufd_device pointer, otherwise returns error pointer. + * + * A driver using this API must set driver_managed_dma and must not touch + * the device until this routine succeeds and establishes ownership. + * + * Binding a PCI device places the entire RID under iommufd control. + * + * The caller must undo this with iommufd_device_unbind() + */ +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id) +{ + struct iommufd_device *idev; + struct iommu_group *group; + int rc; + + /* + * iommufd always sets IOMMU_CACHE because we offer no way for userspace + * to restore cache coherency. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) + return ERR_PTR(-EINVAL); + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + rc = iommu_device_claim_dma_owner(dev, ictx); + if (rc) + goto out_group_put; + + idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_release_owner; + } + idev->ictx = ictx; + iommufd_ctx_get(ictx); + idev->dev = dev; + idev->enforce_cache_coherency = + device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); + /* The calling driver is a user until iommufd_device_unbind() */ + refcount_inc(&idev->obj.users); + /* group refcount moves into iommufd_device */ + idev->group = group; + + /* + * If the caller fails after this success it must call + * iommufd_unbind_device() which is safe since we hold this refcount. + * This also means the device is a leaf in the graph and no other object + * can take a reference on it. + */ + iommufd_object_finalize(ictx, &idev->obj); + *id = idev->obj.id; + return idev; + +out_release_owner: + iommu_device_release_dma_owner(dev); +out_group_put: + iommu_group_put(group); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); + +/** + * iommufd_device_unbind - Undo iommufd_device_bind() + * @idev: Device returned by iommufd_device_bind() + * + * Release the device from iommufd control. The DMA ownership will return back + * to unowned with DMA controlled by the DMA API. This invalidates the + * iommufd_device pointer, other APIs that consume it must not be called + * concurrently. + */ +void iommufd_device_unbind(struct iommufd_device *idev) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); + +static int iommufd_device_setup_msi(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + phys_addr_t sw_msi_start) +{ + int rc; + + /* + * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it + * creates the MSI window by default in the iommu domain. Nothing + * further to do. + */ + if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP)) + return 0; + + /* + * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every + * allocated iommu_domain will block interrupts by default and this + * special flow is needed to turn them back on. iommu_dma_prepare_msi() + * will install pages into our domain after request_irq() to make this + * work. + * + * FIXME: This is conceptually broken for iommufd since we want to allow + * userspace to change the domains, eg switch from an identity IOAS to a + * DMA IOAS. There is currently no way to create a MSI window that + * matches what the IRQ layer actually expects in a newly created + * domain. + */ + if (irq_domain_check_msi_remap()) { + if (WARN_ON(!sw_msi_start)) + return -EPERM; + /* + * iommu_get_msi_cookie() can only be called once per domain, + * it returns -EBUSY on later calls. + */ + if (hwpt->msi_cookie) + return 0; + rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (rc) + return rc; + hwpt->msi_cookie = true; + return 0; + } + + /* + * Otherwise the platform has a MSI window that is not isolated. For + * historical compat with VFIO allow a module parameter to ignore the + * insecurity. + */ + if (!allow_unsafe_interrupts) + return -EPERM; + + dev_warn( + idev->dev, + "MSI interrupt window cannot be isolated by the IOMMU, this platform is insecure. Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + return 0; +} + +static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, + struct iommu_group *group) +{ + struct iommufd_device *cur_dev; + + list_for_each_entry(cur_dev, &hwpt->devices, devices_item) + if (cur_dev->group == group) + return true; + return false; +} + +static int iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + phys_addr_t sw_msi_start = 0; + int rc; + + mutex_lock(&hwpt->devices_lock); + + /* + * Try to upgrade the domain we have, it is an iommu driver bug to + * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail + * enforce_cache_coherency when there are no devices attached to the + * domain. + */ + if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) { + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) { + WARN_ON(list_empty(&hwpt->devices)); + rc = -EINVAL; + goto out_unlock; + } + } + + rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, + idev->group, &sw_msi_start); + if (rc) + goto out_unlock; + + rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + if (rc) + goto out_iova; + + /* + * FIXME: Hack around missing a device-centric iommu api, only attach to + * the group once for the first device that is in the group. + */ + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + rc = iommu_attach_group(hwpt->domain, idev->group); + if (rc) + goto out_iova; + + if (list_empty(&hwpt->devices)) { + rc = iopt_table_add_domain(&hwpt->ioas->iopt, + hwpt->domain); + if (rc) + goto out_detach; + } + } + + idev->hwpt = hwpt; + refcount_inc(&hwpt->obj.users); + list_add(&idev->devices_item, &hwpt->devices); + mutex_unlock(&hwpt->devices_lock); + return 0; + +out_detach: + iommu_detach_group(hwpt->domain, idev->group); +out_iova: + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +out_unlock: + mutex_unlock(&hwpt->devices_lock); + return rc; +} + +/* + * When automatically managing the domains we search for a compatible domain in + * the iopt and if one is found use it, otherwise create a new domain. + * Automatic domain selection will never pick a manually created domain. + */ +static int iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + /* + * There is no differentiation when domains are allocated, so any domain + * that is willing to attach to the device is interchangeable with any + * other. + */ + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->auto_domain) + continue; + + rc = iommufd_device_do_attach(idev, hwpt); + + /* + * -EINVAL means the domain is incompatible with the device. + * Other error codes should propagate to userspace as failure. + * Success means the domain is attached. + */ + if (rc == -EINVAL) + continue; + goto out_unlock; + } + + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + hwpt->auto_domain = true; + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_abort; + list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list); + + mutex_unlock(&ioas->mutex); + iommufd_object_finalize(idev->ictx, &hwpt->obj); + return 0; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + return rc; +} + +/** + * iommufd_device_attach - Connect a device from an iommu_domain + * @idev: device to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This connects the device to an iommu_domain, either automatically or manually + * selected. Once this completes the device could do DMA. + * + * The caller should return the resulting pt_id back to userspace. + * This function is undone by calling iommufd_device_detach(). + */ +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +{ + struct iommufd_object *pt_obj; + int rc; + + pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return PTR_ERR(pt_obj); + + switch (pt_obj->type) { + case IOMMUFD_OBJ_HW_PAGETABLE: { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_put_pt_obj; + + mutex_lock(&hwpt->ioas->mutex); + list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); + mutex_unlock(&hwpt->ioas->mutex); + break; + } + case IOMMUFD_OBJ_IOAS: { + struct iommufd_ioas *ioas = + container_of(pt_obj, struct iommufd_ioas, obj); + + rc = iommufd_device_auto_get_domain(idev, ioas); + if (rc) + goto out_put_pt_obj; + break; + } + default: + rc = -EINVAL; + goto out_put_pt_obj; + } + + refcount_inc(&idev->obj.users); + *pt_id = idev->hwpt->obj.id; + rc = 0; + +out_put_pt_obj: + iommufd_put_object(pt_obj); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); + +/** + * iommufd_device_detach - Disconnect a device to an iommu_domain + * @idev: device to detach + * + * Undo iommufd_device_attach(). This disconnects the idev from the previously + * attached pt_id. The device returns back to a blocked DMA translation. + */ +void iommufd_device_detach(struct iommufd_device *idev) +{ + struct iommufd_hw_pagetable *hwpt = idev->hwpt; + + mutex_lock(&hwpt->ioas->mutex); + mutex_lock(&hwpt->devices_lock); + list_del(&idev->devices_item); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + if (list_empty(&hwpt->devices)) { + iopt_table_remove_domain(&hwpt->ioas->iopt, + hwpt->domain); + list_del(&hwpt->hwpt_item); + } + iommu_detach_group(hwpt->domain, idev->group); + } + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + mutex_unlock(&hwpt->devices_lock); + mutex_unlock(&hwpt->ioas->mutex); + + if (hwpt->auto_domain) + iommufd_object_destroy_user(idev->ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); + + idev->hwpt = NULL; + + refcount_dec(&idev->obj.users); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 6b0448702a956..72a0c805be233 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -103,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, IOMMUFD_OBJ_HW_PAGETABLE, IOMMUFD_OBJ_IOAS, }; @@ -229,6 +230,8 @@ struct iommufd_hw_pagetable { struct iommufd_ioas *ioas; struct iommu_domain *domain; bool auto_domain : 1; + bool enforce_cache_coherency : 1; + bool msi_cookie : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; struct mutex devices_lock; @@ -240,6 +243,8 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct device *dev); void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); +void iommufd_device_destroy(struct iommufd_object *obj); + struct iommufd_access { unsigned long iova_alignment; u32 iopt_access_list_id; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ac6580a7b7066..fe98912bab0e0 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -357,6 +357,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_DEVICE] = { + .destroy = iommufd_device_destroy, + }, [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 26e09d539737b..185dff3eb32f1 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,19 @@ #include #include #include +#include +struct iommufd_device; struct iommufd_ctx; struct file; +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id); +void iommufd_device_unbind(struct iommufd_device *idev); + +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev); + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, -- GitLab From 8d40205f6093f18e07fe3dc5920fc85e9f82b8b3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:37 -0400 Subject: [PATCH 1013/1423] iommufd: Add kAPI toward external drivers for kernel access Kernel access is the mode that VFIO "mdevs" use. In this case there is no struct device and no IOMMU connection. iommufd acts as a record keeper for accesses and returns the actual struct pages back to the caller to use however they need. eg with kmap or the DMA API. Each caller must create a struct iommufd_access with iommufd_access_create(), similar to how iommufd_device_bind() works. Using this struct the caller can access blocks of IOVA using iommufd_access_pin_pages() or iommufd_access_rw(). Callers must provide a callback that immediately unpins any IOVA being used within a range. This happens if userspace unmaps the IOVA under the pin. The implementation forwards the access requests directly to the iopt infrastructure that manages the iopt_pages_access. Link: https://lore.kernel.org/r/14-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 316 ++++++++++++++++++++++++ drivers/iommu/iommufd/io_pagetable.c | 8 +- drivers/iommu/iommufd/iommufd_private.h | 10 + drivers/iommu/iommufd/main.c | 3 + include/linux/iommufd.h | 43 +++- 5 files changed, 377 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 67cd00b4d9265..06b6894b77062 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -6,6 +6,7 @@ #include #include +#include "io_pagetable.h" #include "iommufd_private.h" static bool allow_unsafe_interrupts; @@ -417,3 +418,318 @@ void iommufd_device_detach(struct iommufd_device *idev) refcount_dec(&idev->obj.users); } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); + +void iommufd_access_destroy_object(struct iommufd_object *obj) +{ + struct iommufd_access *access = + container_of(obj, struct iommufd_access, obj); + + iopt_remove_access(&access->ioas->iopt, access); + iommufd_ctx_put(access->ictx); + refcount_dec(&access->ioas->obj.users); +} + +/** + * iommufd_access_create - Create an iommufd_access + * @ictx: iommufd file descriptor + * @ioas_id: ID for a IOMMUFD_OBJ_IOAS + * @ops: Driver's ops to associate with the access + * @data: Opaque data to pass into ops functions + * + * An iommufd_access allows a driver to read/write to the IOAS without using + * DMA. The underlying CPU memory can be accessed using the + * iommufd_access_pin_pages() or iommufd_access_rw() functions. + * + * The provided ops are required to use iommufd_access_pin_pages(). + */ +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data) +{ + struct iommufd_access *access; + struct iommufd_object *obj; + int rc; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + access->data = data; + access->ops = ops; + + obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS); + if (IS_ERR(obj)) { + rc = PTR_ERR(obj); + goto out_abort; + } + access->ioas = container_of(obj, struct iommufd_ioas, obj); + iommufd_ref_to_users(obj); + + if (ops->needs_pin_pages) + access->iova_alignment = PAGE_SIZE; + else + access->iova_alignment = 1; + rc = iopt_add_access(&access->ioas->iopt, access); + if (rc) + goto out_put_ioas; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + access->ictx = ictx; + iommufd_ctx_get(ictx); + iommufd_object_finalize(ictx, &access->obj); + return access; +out_put_ioas: + refcount_dec(&access->ioas->obj.users); +out_abort: + iommufd_object_abort(ictx, &access->obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); + +/** + * iommufd_access_destroy - Destroy an iommufd_access + * @access: The access to destroy + * + * The caller must stop using the access before destroying it. + */ +void iommufd_access_destroy(struct iommufd_access *access) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(access->ictx, &access->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); + +/** + * iommufd_access_notify_unmap - Notify users of an iopt to stop using it + * @iopt: iopt to work on + * @iova: Starting iova in the iopt + * @length: Number of bytes + * + * After this function returns there should be no users attached to the pages + * linked to this iopt that intersect with iova,length. Anyone that has attached + * a user through iopt_access_pages() needs to detach it through + * iommufd_access_unpin_pages() before this function returns. + * + * iommufd_access_destroy() will wait for any outstanding unmap callback to + * complete. Once iommufd_access_destroy() no unmap ops are running or will + * run in the future. Due to this a driver must not create locking that prevents + * unmap to complete while iommufd_access_destroy() is running. + */ +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + struct iommufd_ioas *ioas = + container_of(iopt, struct iommufd_ioas, iopt); + struct iommufd_access *access; + unsigned long index; + + xa_lock(&ioas->iopt.access_list); + xa_for_each(&ioas->iopt.access_list, index, access) { + if (!iommufd_lock_obj(&access->obj)) + continue; + xa_unlock(&ioas->iopt.access_list); + + access->ops->unmap(access->data, iova, length); + + iommufd_put_object(&access->obj); + xa_lock(&ioas->iopt.access_list); + } + xa_unlock(&ioas->iopt.access_list); +} + +/** + * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * + * Return the struct page's. The caller must stop accessing them before calling + * this. The iova/length must exactly match the one provided to access_pages. + */ +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + + if (WARN_ON(!length) || + WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) + return; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, + min(last_iova, iopt_area_last_iova(area)))); + up_read(&iopt->iova_rwsem); + WARN_ON(!iopt_area_contig_done(&iter)); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); + +static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) +{ + if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE) + return false; + + if (!iopt_area_contig_done(iter) && + (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) % + PAGE_SIZE) != (PAGE_SIZE - 1)) + return false; + return true; +} + +static bool check_area_prot(struct iopt_area *area, unsigned int flags) +{ + if (flags & IOMMUFD_ACCESS_RW_WRITE) + return area->iommu_prot & IOMMU_WRITE; + return area->iommu_prot & IOMMU_READ; +} + +/** + * iommufd_access_pin_pages() - Return a list of pages under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * @out_pages: Output page list + * @flags: IOPMMUFD_ACCESS_RW_* flags + * + * Reads @length bytes starting at iova and returns the struct page * pointers. + * These can be kmap'd by the caller for CPU access. + * + * The caller must perform iommufd_access_unpin_pages() when done to balance + * this. + * + * This API always requires a page aligned iova. This happens naturally if the + * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However + * smaller alignments have corner cases where this API can fail on otherwise + * aligned iova. + */ +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long last_index = iopt_area_iova_to_index(area, last); + unsigned long index = + iopt_area_iova_to_index(area, iter.cur_iova); + + if (area->prevent_access || + !iopt_area_contig_is_aligned(&iter)) { + rc = -EINVAL; + goto err_remove; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_remove; + } + + rc = iopt_area_add_access(area, index, last_index, out_pages, + flags); + if (rc) + goto err_remove; + out_pages += last_index - index + 1; + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_remove; + } + + up_read(&iopt->iova_rwsem); + return 0; + +err_remove: + if (iova < iter.cur_iova) { + last_iova = iter.cur_iova - 1; + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, + iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, min(last_iova, + iopt_area_last_iova(area)))); + } + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); + +/** + * iommufd_access_rw - Read or write data under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @data: Kernel buffer to copy to/from + * @length: Number of bytes to access + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Copy kernel to/from data into the range given by IOVA/length. If flags + * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized + * by changing it into copy_to/from_user(). + */ +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t length, unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + struct iopt_area *area; + unsigned long last_iova; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long bytes = (last - iter.cur_iova) + 1; + + if (area->prevent_access) { + rc = -EINVAL; + goto err_out; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_out; + } + + rc = iopt_pages_rw_access( + area->pages, iopt_area_start_byte(area, iter.cur_iova), + data, bytes, flags); + if (rc) + goto err_out; + data += bytes; + } + if (!iopt_area_contig_done(&iter)) + rc = -ENOENT; +err_out: + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 756d347948f0e..4f4a9d9aac570 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -458,6 +458,7 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, * is NULL. This prevents domain attach/detatch from running * concurrently with cleaning up the area. */ +again: down_read(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); while ((area = iopt_area_iter_first(iopt, start, last))) { @@ -486,8 +487,11 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, area->prevent_access = true; up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); - /* Later patch calls back to drivers to unmap */ - return -EBUSY; + iommufd_access_notify_unmap(iopt, area_first, + iopt_area_length(area)); + if (WARN_ON(READ_ONCE(area->num_accesses))) + return -EDEADLOCK; + goto again; } pages = area->pages; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 72a0c805be233..40302cc0da360 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -65,6 +65,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, @@ -106,6 +108,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_DEVICE, IOMMUFD_OBJ_HW_PAGETABLE, IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, }; /* Base struct for all objects with a userspace ID handle. */ @@ -246,6 +249,11 @@ void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); struct iommufd_access { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_ioas *ioas; + const struct iommufd_access_ops *ops; + void *data; unsigned long iova_alignment; u32 iopt_access_list_id; }; @@ -253,4 +261,6 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access); +void iommufd_access_destroy_object(struct iommufd_object *obj); + #endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index fe98912bab0e0..4153f6a202555 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -357,6 +357,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_ACCESS] = { + .destroy = iommufd_access_destroy_object, + }, [IOMMUFD_OBJ_DEVICE] = { .destroy = iommufd_device_destroy, }, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 185dff3eb32f1..46c481a26d791 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,12 @@ #include #include #include -#include +struct device; struct iommufd_device; +struct page; struct iommufd_ctx; +struct iommufd_access; struct file; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, @@ -22,6 +24,11 @@ void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); +struct iommufd_access_ops { + u8 needs_pin_pages : 1; + void (*unmap)(void *data, unsigned long iova, unsigned long length); +}; + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, @@ -29,11 +36,24 @@ enum { IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, }; +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data); +void iommufd_access_destroy(struct iommufd_access *access); + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); void iommufd_ctx_put(struct iommufd_ctx *ictx); + +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags); +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length); +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -43,5 +63,26 @@ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) { } + +static inline int iommufd_access_pin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length, + struct page **out_pages, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length) +{ +} + +static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif -- GitLab From d624d6652a65ad4f47a58b8651a1ec1163bb81d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:38 -0400 Subject: [PATCH 1014/1423] iommufd: vfio container FD ioctl compatibility iommufd can directly implement the /dev/vfio/vfio container IOCTLs by mapping them into io_pagetable operations. A userspace application can test against iommufd and confirm compatibility then simply make a small change to open /dev/iommu instead of /dev/vfio/vfio. For testing purposes /dev/vfio/vfio can be symlinked to /dev/iommu and then all applications will use the compatibility path with no code changes. A later series allows /dev/vfio/vfio to be directly provided by iommufd, which allows the rlimit mode to work the same as well. This series just provides the iommufd side of compatibility. Actually linking this to VFIO_SET_CONTAINER is a followup series, with a link in the cover letter. Internally the compatibility API uses a normal IOAS object that, like vfio, is automatically allocated when the first device is attached. Userspace can also query or set this IOAS object directly using the IOMMU_VFIO_IOAS ioctl. This allows mixing and matching new iommufd only features while still using the VFIO style map/unmap ioctls. While this is enough to operate qemu, it has a few differences: - Resource limits rely on memory cgroups to bound what userspace can do instead of the module parameter dma_entry_limit. - VFIO P2P is not implemented. The DMABUF patches for vfio are a start at a solution where iommufd would import a special DMABUF. This is to avoid further propogating the follow_pfn() security problem. - A full audit for pedantic compatibility details (eg errnos, etc) has not yet been done - powerpc SPAPR is left out, as it is not connected to the iommu_domain framework. It seems interest in SPAPR is minimal as it is currently non-working in v6.1-rc1. They will have to convert to the iommu subsystem framework to enjoy iommfd. The following are not going to be implemented and we expect to remove them from VFIO type1: - SW access 'dirty tracking'. As discussed in the cover letter this will be done in VFIO. - VFIO_TYPE1_NESTING_IOMMU https://lore.kernel.org/all/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - VFIO_DMA_MAP_FLAG_VADDR https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ Link: https://lore.kernel.org/r/15-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Makefile | 3 +- drivers/iommu/iommufd/iommufd_private.h | 6 + drivers/iommu/iommufd/main.c | 16 +- drivers/iommu/iommufd/vfio_compat.c | 472 ++++++++++++++++++++++++ include/linux/iommufd.h | 7 + include/uapi/linux/iommufd.h | 36 ++ 6 files changed, 534 insertions(+), 6 deletions(-) create mode 100644 drivers/iommu/iommufd/vfio_compat.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index ca28a135b9675..2fdff04000b32 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -5,6 +5,7 @@ iommufd-y := \ io_pagetable.o \ ioas.o \ main.o \ - pages.o + pages.o \ + vfio_compat.o obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 40302cc0da360..8fe5f162ccbcf 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -18,6 +18,7 @@ struct iommufd_ctx { struct xarray objects; u8 account_mode; + struct iommufd_ioas *vfio_ioas; }; /* @@ -92,6 +93,9 @@ struct iommufd_ucmd { void *cmd; }; +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg); + /* Copy the response in ucmd->cmd back to userspace. */ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, size_t cmd_len) @@ -222,6 +226,8 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); + /* * A HW pagetable is called an iommu_domain inside the kernel. This user object * allows directly creating and inspecting the domains. Domains that have kernel diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 4153f6a202555..5cf69c4d591de 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -133,6 +133,8 @@ bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, return false; } __xa_erase(&ictx->objects, obj->id); + if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj) + ictx->vfio_ioas = NULL; xa_unlock(&ictx->objects); up_write(&obj->destroy_rwsem); @@ -271,27 +273,31 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { length), IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), + IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, + __reserved), }; static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct iommufd_ctx *ictx = filp->private_data; const struct iommufd_ioctl_op *op; struct iommufd_ucmd ucmd = {}; union ucmd_buffer buf; unsigned int nr; int ret; - ucmd.ictx = filp->private_data; + nr = _IOC_NR(cmd); + if (nr < IOMMUFD_CMD_BASE || + (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) + return iommufd_vfio_ioctl(ictx, cmd, arg); + + ucmd.ictx = ictx; ucmd.ubuffer = (void __user *)arg; ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); if (ret) return ret; - nr = _IOC_NR(cmd); - if (nr < IOMMUFD_CMD_BASE || - (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) - return -ENOIOCTLCMD; op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; if (op->ioctl_num != cmd) return -ENOIOCTLCMD; diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c new file mode 100644 index 0000000000000..3ceca0e8311c3 --- /dev/null +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iommufd_private.h" + +static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); + + xa_lock(&ictx->objects); + if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) + goto out_unlock; + ioas = ictx->vfio_ioas; +out_unlock: + xa_unlock(&ictx->objects); + return ioas; +} + +/** + * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use + * @ictx: Context to operate on + * @out_ioas_id: The ioas_id the caller should use + * + * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate + * on since they do not have an IOAS ID input in their ABI. Only attaching a + * group should cause a default creation of the internal ioas, this returns the + * existing ioas if it has already been assigned somehow. + */ +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) +{ + struct iommufd_ioas *ioas = NULL; + struct iommufd_ioas *out_ioas; + + ioas = iommufd_ioas_alloc(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + xa_lock(&ictx->objects); + if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) + out_ioas = ictx->vfio_ioas; + else { + out_ioas = ioas; + ictx->vfio_ioas = ioas; + } + xa_unlock(&ictx->objects); + + *out_ioas_id = out_ioas->obj.id; + if (out_ioas != ioas) { + iommufd_put_object(&out_ioas->obj); + iommufd_object_abort(ictx, &ioas->obj); + return 0; + } + /* + * An automatically created compat IOAS is treated as a userspace + * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, + * and if not manually destroyed it will be destroyed automatically + * at iommufd release. + */ + iommufd_object_finalize(ictx, &ioas->obj); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) +{ + struct iommu_vfio_ioas *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + + if (cmd->__reserved) + return -EOPNOTSUPP; + switch (cmd->op) { + case IOMMU_VFIO_IOAS_GET: + ioas = get_compat_ioas(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + cmd->ioas_id = ioas->obj.id; + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + + case IOMMU_VFIO_IOAS_SET: + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = ioas; + xa_unlock(&ucmd->ictx->objects); + iommufd_put_object(&ioas->obj); + return 0; + + case IOMMU_VFIO_IOAS_CLEAR: + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = NULL; + xa_unlock(&ucmd->ictx->objects); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + struct vfio_iommu_type1_dma_map map; + int iommu_prot = IOMMU_CACHE; + struct iommufd_ioas *ioas; + unsigned long iova; + int rc; + + if (copy_from_user(&map, arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~supported_flags) + return -EINVAL; + + if (map.flags & VFIO_DMA_MAP_FLAG_READ) + iommu_prot |= IOMMU_READ; + if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) + iommu_prot |= IOMMU_WRITE; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * Maps created through the legacy interface always use VFIO compatible + * rlimit accounting. If the user wishes to use the faster user based + * rlimit accounting then they must use the new interface. + */ + iova = map.iova; + rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), + map.size, iommu_prot, 0); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + /* + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new + * dirty tracking direction: + * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ + * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ + */ + u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; + struct vfio_iommu_type1_dma_unmap unmap; + unsigned long unmapped = 0; + struct iommufd_ioas *ioas; + int rc; + + if (copy_from_user(&unmap, arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags & ~supported_flags) + return -EINVAL; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { + if (unmap.iova != 0 || unmap.size != 0) { + rc = -EINVAL; + goto err_put; + } + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + } else { + if (READ_ONCE(ioas->iopt.disable_large_pages)) { + /* + * Create cuts at the start and last of the requested + * range. If the start IOVA is 0 then it doesn't need to + * be cut. + */ + unsigned long iovas[] = { unmap.iova + unmap.size - 1, + unmap.iova - 1 }; + + rc = iopt_cut_iova(&ioas->iopt, iovas, + unmap.iova ? 2 : 1); + if (rc) + goto err_put; + } + rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, + &unmapped); + } + unmap.size = unmapped; + if (copy_to_user(arg, &unmap, minsz)) + rc = -EFAULT; + +err_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = 1; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->enforce_cache_coherency) { + rc = 0; + break; + } + } + mutex_unlock(&ioas->mutex); + + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, + unsigned long type) +{ + switch (type) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_UNMAP_ALL: + return 1; + + case VFIO_DMA_CC_IOMMU: + return iommufd_vfio_cc_iommu(ictx); + + /* + * This is obsolete, and to be removed from VFIO. It was an incomplete + * idea that got merged. + * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ + */ + case VFIO_TYPE1_NESTING_IOMMU: + return 0; + + /* + * VFIO_DMA_MAP_FLAG_VADDR + * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ + * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ + * + * It is hard to see how this could be implemented safely. + */ + case VFIO_UPDATE_VADDR: + default: + return 0; + } +} + +static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) +{ + struct iommufd_ioas *ioas = NULL; + int rc = 0; + + if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) + return -EINVAL; + + /* VFIO fails the set_iommu if there is no group */ + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * The difference between TYPE1 and TYPE1v2 is the ability to unmap in + * the middle of mapped ranges. This is complicated by huge page support + * which creates single large IOPTEs that cannot be split by the iommu + * driver. TYPE1 is very old at this point and likely nothing uses it, + * however it is simple enough to emulate by simply disabling the + * problematic large IOPTEs. Then we can safely unmap within any range. + */ + if (type == VFIO_TYPE1_IOMMU) + rc = iopt_disable_large_pages(&ioas->iopt); + iommufd_put_object(&ioas->obj); + return rc; +} + +static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) +{ + struct io_pagetable *iopt = &ioas->iopt; + unsigned long pgsize_bitmap = ULONG_MAX; + struct iommu_domain *domain; + unsigned long index; + + down_read(&iopt->domains_rwsem); + xa_for_each(&iopt->domains, index, domain) + pgsize_bitmap &= domain->pgsize_bitmap; + + /* See vfio_update_pgsize_bitmap() */ + if (pgsize_bitmap & ~PAGE_MASK) { + pgsize_bitmap &= PAGE_MASK; + pgsize_bitmap |= PAGE_SIZE; + } + pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); + up_read(&iopt->domains_rwsem); + return pgsize_bitmap; +} + +static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = + container_of(cur, + struct vfio_iommu_type1_info_cap_iova_range __user, + header); + struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, + .version = 1, + }, + }; + struct interval_tree_span_iter span; + + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + struct vfio_iova_range range; + + if (!span.is_hole) + continue; + range.start = span.start_hole; + range.end = span.last_hole; + if (avail >= struct_size(&cap_iovas, iova_ranges, + cap_iovas.nr_iovas + 1) && + copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], + &range, sizeof(range))) + return -EFAULT; + cap_iovas.nr_iovas++; + } + if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && + copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) + return -EFAULT; + return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); +} + +static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, + .version = 1, + }, + /* + * iommufd's limit is based on the cgroup's memory limit. + * Normally vfio would return U16_MAX here, and provide a module + * parameter to adjust it. Since S390 qemu userspace actually + * pays attention and needs a value bigger than U16_MAX return + * U32_MAX. + */ + .avail = U32_MAX, + }; + + if (avail >= sizeof(cap_dma) && + copy_to_user(cur, &cap_dma, sizeof(cap_dma))) + return -EFAULT; + return sizeof(cap_dma); +} + +static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, + void __user *arg) +{ + typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail); + static const fill_cap_fn fill_fns[] = { + iommufd_fill_cap_dma_avail, + iommufd_fill_cap_iova, + }; + size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + struct vfio_info_cap_header __user *last_cap = NULL; + struct vfio_iommu_type1_info info; + struct iommufd_ioas *ioas; + size_t total_cap_size; + int rc; + int i; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + minsz = min_t(size_t, info.argsz, sizeof(info)); + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + info.flags = VFIO_IOMMU_INFO_PGSIZES; + info.iova_pgsizes = iommufd_get_pagesizes(ioas); + info.cap_offset = 0; + + down_read(&ioas->iopt.iova_rwsem); + total_cap_size = sizeof(info); + for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { + int cap_size; + + if (info.argsz > total_cap_size) + cap_size = fill_fns[i](ioas, arg + total_cap_size, + info.argsz - total_cap_size); + else + cap_size = fill_fns[i](ioas, NULL, 0); + if (cap_size < 0) { + rc = cap_size; + goto out_put; + } + if (last_cap && info.argsz >= total_cap_size && + put_user(total_cap_size, &last_cap->next)) { + rc = -EFAULT; + goto out_put; + } + last_cap = arg + total_cap_size; + total_cap_size += cap_size; + } + + /* + * If the user did not provide enough space then only some caps are + * returned and the argsz will be updated to the correct amount to get + * all caps. + */ + if (info.argsz >= total_cap_size) + info.cap_offset = sizeof(info); + info.argsz = total_cap_size; + info.flags |= VFIO_IOMMU_INFO_CAPS; + if (copy_to_user(arg, &info, minsz)) { + rc = -EFAULT; + goto out_put; + } + rc = 0; + +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg) +{ + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GET_API_VERSION: + return VFIO_API_VERSION; + case VFIO_SET_IOMMU: + return iommufd_vfio_set_iommu(ictx, arg); + case VFIO_CHECK_EXTENSION: + return iommufd_vfio_check_extension(ictx, arg); + case VFIO_IOMMU_GET_INFO: + return iommufd_vfio_iommu_get_info(ictx, uarg); + case VFIO_IOMMU_MAP_DMA: + return iommufd_vfio_map_dma(ictx, cmd, uarg); + case VFIO_IOMMU_UNMAP_DMA: + return iommufd_vfio_unmap_dma(ictx, cmd, uarg); + case VFIO_IOMMU_DIRTY_PAGES: + default: + return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 46c481a26d791..84af9a239769a 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -54,6 +54,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length); int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, void *data, size_t len, unsigned int flags); +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -84,5 +85,11 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long { return -EOPNOTSUPP; } + +static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, + u32 *out_ioas_id) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 30cc5c5e2b347..98ebba80cfa1f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -44,6 +44,7 @@ enum { IOMMUFD_CMD_IOAS_MAP, IOMMUFD_CMD_IOAS_UNMAP, IOMMUFD_CMD_OPTION, + IOMMUFD_CMD_VFIO_IOAS, }; /** @@ -308,4 +309,39 @@ struct iommu_option { __aligned_u64 val64; }; #define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) + +/** + * enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls + * @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS + * @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS + * @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility + */ +enum iommufd_vfio_ioas_op { + IOMMU_VFIO_IOAS_GET = 0, + IOMMU_VFIO_IOAS_SET = 1, + IOMMU_VFIO_IOAS_CLEAR = 2, +}; + +/** + * struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS) + * @size: sizeof(struct iommu_vfio_ioas) + * @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set + * For IOMMU_VFIO_IOAS_GET will output the IOAS ID + * @op: One of enum iommufd_vfio_ioas_op + * @__reserved: Must be 0 + * + * The VFIO compatibility support uses a single ioas because VFIO APIs do not + * support the ID field. Set or Get the IOAS that VFIO compatibility will use. + * When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the + * compatibility ioas, either by taking what is already set, or auto creating + * one. From then on VFIO will continue to use that ioas and is not effected by + * this ioctl. SET or CLEAR does not destroy any auto-created IOAS. + */ +struct iommu_vfio_ioas { + __u32 size; + __u32 ioas_id; + __u16 op; + __u16 __reserved; +}; +#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) #endif -- GitLab From f4b20bb34c83dceade5470288f48f94ce3598ada Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:39 -0400 Subject: [PATCH 1015/1423] iommufd: Add kernel support for testing iommufd Provide a mock kernel module for the iommu_domain that allows it to run without any HW and the mocking provides a way to directly validate that the PFNs loaded into the iommu_domain are correct. This exposes the access kAPI toward userspace to allow userspace to explore the functionality of pages.c and io_pagetable.c The mock also simulates the rare case of PAGE_SIZE > iommu page size as the mock will operate at a 2K iommu page size. This allows exercising all of the calculations to support this mismatch. This is also intended to support syzkaller exploring the same space. However, it is an unusually invasive config option to enable all of this. The config option should not be enabled in a production kernel. Link: https://lore.kernel.org/r/16-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Matthew Rosato # s390 Tested-by: Eric Auger # aarch64 Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Kconfig | 12 + drivers/iommu/iommufd/Makefile | 2 + drivers/iommu/iommufd/device.c | 38 ++ drivers/iommu/iommufd/ioas.c | 3 + drivers/iommu/iommufd/iommufd_private.h | 35 + drivers/iommu/iommufd/iommufd_test.h | 93 +++ drivers/iommu/iommufd/main.c | 14 + drivers/iommu/iommufd/pages.c | 8 + drivers/iommu/iommufd/selftest.c | 853 ++++++++++++++++++++++++ include/linux/iommufd.h | 3 + 10 files changed, 1061 insertions(+) create mode 100644 drivers/iommu/iommufd/iommufd_test.h create mode 100644 drivers/iommu/iommufd/selftest.c diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 164812084a675..871244f2443fb 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -10,3 +10,15 @@ config IOMMUFD it relates to managing IO page tables that point at user space memory. If you don't know what to do here, say N. + +if IOMMUFD +config IOMMUFD_TEST + bool "IOMMU Userspace API Test support" + depends on DEBUG_KERNEL + depends on FAULT_INJECTION + depends on RUNTIME_TESTING_MENU + default n + help + This is dangerous, do not enable unless running + tools/testing/selftests/iommu +endif diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 2fdff04000b32..8aeba81800c51 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -8,4 +8,6 @@ iommufd-y := \ pages.o \ vfio_compat.o +iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o + obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 06b6894b77062..67ce36152e8ab 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -733,3 +733,41 @@ err_out: return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); + +#ifdef CONFIG_IOMMUFD_TEST +/* + * Creating a real iommufd_device is too hard, bypass creating a iommufd_device + * and go directly to attaching a domain. + */ +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_hw_pagetable_alloc(ictx, ioas, mock_dev); + if (IS_ERR(hwpt)) + return hwpt; + + rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + if (rc) + goto out_hwpt; + + refcount_inc(&hwpt->obj.users); + iommufd_object_finalize(ictx, &hwpt->obj); + return hwpt; + +out_hwpt: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + refcount_dec(&hwpt->obj.users); +} +#endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 302779b33bd4a..31577e9d434f8 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -242,6 +242,9 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) unsigned long iova; int rc; + iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova, + &cmd->flags); + if ((cmd->flags & ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8fe5f162ccbcf..222e86591f8ac 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -113,6 +113,9 @@ enum iommufd_object_type { IOMMUFD_OBJ_HW_PAGETABLE, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif }; /* Base struct for all objects with a userspace ID handle. */ @@ -269,4 +272,36 @@ void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access); void iommufd_access_destroy_object(struct iommufd_object *obj); +#ifdef CONFIG_IOMMUFD_TEST +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev); +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt); +int iommufd_test(struct iommufd_ucmd *ucmd); +void iommufd_selftest_destroy(struct iommufd_object *obj); +extern size_t iommufd_test_memory_limit; +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags); +bool iommufd_should_fail(void); +void __init iommufd_test_init(void); +void iommufd_test_exit(void); +#else +static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, + u64 *iova, u32 *flags) +{ +} +static inline bool iommufd_should_fail(void) +{ + return false; +} +static inline void __init iommufd_test_init(void) +{ +} +static inline void iommufd_test_exit(void) +{ +} +#endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h new file mode 100644 index 0000000000000..1d96a8f466fd2 --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef _UAPI_IOMMUFD_TEST_H +#define _UAPI_IOMMUFD_TEST_H + +#include +#include + +enum { + IOMMU_TEST_OP_ADD_RESERVED = 1, + IOMMU_TEST_OP_MOCK_DOMAIN, + IOMMU_TEST_OP_MD_CHECK_MAP, + IOMMU_TEST_OP_MD_CHECK_REFS, + IOMMU_TEST_OP_CREATE_ACCESS, + IOMMU_TEST_OP_DESTROY_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_RW, + IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, +}; + +enum { + MOCK_APERTURE_START = 1UL << 24, + MOCK_APERTURE_LAST = (1UL << 31) - 1, +}; + +enum { + MOCK_FLAGS_ACCESS_WRITE = 1 << 0, + MOCK_FLAGS_ACCESS_SYZ = 1 << 16, +}; + +enum { + MOCK_ACCESS_RW_WRITE = 1 << 0, + MOCK_ACCESS_RW_SLOW_PATH = 1 << 2, +}; + +enum { + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, +}; + +struct iommu_test_cmd { + __u32 size; + __u32 op; + __u32 id; + __u32 __reserved; + union { + struct { + __aligned_u64 start; + __aligned_u64 length; + } add_reserved; + struct { + __u32 out_device_id; + __u32 out_hwpt_id; + } mock_domain; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } check_map; + struct { + __aligned_u64 length; + __aligned_u64 uptr; + __u32 refs; + } check_refs; + struct { + __u32 out_access_fd; + __u32 flags; + } create_access; + struct { + __u32 access_pages_id; + } destroy_access_pages; + struct { + __u32 flags; + __u32 out_access_pages_id; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } access_pages; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + __u32 flags; + } access_rw; + struct { + __u32 limit; + } memory_limit; + }; + __u32 last; +}; +#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) + +#endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 5cf69c4d591de..7c8f40bc8d98d 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -19,6 +19,7 @@ #include #include "iommufd_private.h" +#include "iommufd_test.h" struct iommufd_object_ops { void (*destroy)(struct iommufd_object *obj); @@ -239,6 +240,9 @@ union ucmd_buffer { struct iommu_ioas_iova_ranges iova_ranges; struct iommu_ioas_map map; struct iommu_ioas_unmap unmap; +#ifdef CONFIG_IOMMUFD_TEST + struct iommu_test_cmd test; +#endif }; struct iommufd_ioctl_op { @@ -275,6 +279,9 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { val64), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), +#ifdef CONFIG_IOMMUFD_TEST + IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), +#endif }; static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, @@ -375,6 +382,11 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_HW_PAGETABLE] = { .destroy = iommufd_hw_pagetable_destroy, }, +#ifdef CONFIG_IOMMUFD_TEST + [IOMMUFD_OBJ_SELFTEST] = { + .destroy = iommufd_selftest_destroy, + }, +#endif }; static struct miscdevice iommu_misc_dev = { @@ -392,11 +404,13 @@ static int __init iommufd_init(void) ret = misc_register(&iommu_misc_dev); if (ret) return ret; + iommufd_test_init(); return 0; } static void __exit iommufd_exit(void) { + iommufd_test_exit(); misc_deregister(&iommu_misc_dev); } diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index bafeee9d73e8a..640331b8a0791 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -56,7 +56,11 @@ #include "io_pagetable.h" #include "double_span.h" +#ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT 65536 +#else +#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit +#endif #define BATCH_BACKUP_SIZE 32 /* @@ -1756,6 +1760,10 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, bool change_mm = current->mm != pages->source_mm; int rc = 0; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH)) + change_mm = true; + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c new file mode 100644 index 0000000000000..cfb5fe9a5e0ee --- /dev/null +++ b/drivers/iommu/iommufd/selftest.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * Kernel side components to support tools/testing/selftests/iommu + */ +#include +#include +#include +#include +#include +#include +#include + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +static DECLARE_FAULT_ATTR(fail_iommufd); +static struct dentry *dbgfs_root; + +size_t iommufd_test_memory_limit = 65536; + +enum { + MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + + /* + * Like a real page table alignment requires the low bits of the address + * to be zero. xarray also requires the high bit to be zero, so we store + * the pfns shifted. The upper bits are used for metadata. + */ + MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, + + _MOCK_PFN_START = MOCK_PFN_MASK + 1, + MOCK_PFN_START_IOVA = _MOCK_PFN_START, + MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, +}; + +/* + * Syzkaller has trouble randomizing the correct iova to use since it is linked + * to the map ioctl's output, and it has no ide about that. So, simplify things. + * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset + * value. This has a much smaller randomization space and syzkaller can hit it. + */ +static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt, + u64 *iova) +{ + struct syz_layout { + __u32 nth_area; + __u32 offset; + }; + struct syz_layout *syz = (void *)iova; + unsigned int nth = syz->nth_area; + struct iopt_area *area; + + down_read(&iopt->iova_rwsem); + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (nth == 0) { + up_read(&iopt->iova_rwsem); + return iopt_area_iova(area) + syz->offset; + } + nth--; + } + up_read(&iopt->iova_rwsem); + + return 0; +} + +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags) +{ + struct iommufd_ioas *ioas; + + if (!(*flags & MOCK_FLAGS_ACCESS_SYZ)) + return; + *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ; + + ioas = iommufd_get_ioas(ucmd, ioas_id); + if (IS_ERR(ioas)) + return; + *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); + iommufd_put_object(&ioas->obj); +} + +struct mock_iommu_domain { + struct iommu_domain domain; + struct xarray pfns; +}; + +enum selftest_obj_type { + TYPE_IDEV, +}; + +struct selftest_obj { + struct iommufd_object obj; + enum selftest_obj_type type; + + union { + struct { + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ctx *ictx; + struct device mock_dev; + } idev; + }; +}; + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + struct mock_iommu_domain *mock; + + if (WARN_ON(iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)) + return NULL; + + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return NULL; + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + xa_init(&mock->pfns); + return &mock->domain; +} + +static void mock_domain_free(struct iommu_domain *domain) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + + WARN_ON(!xa_empty(&mock->pfns)); + kfree(mock); +} + +static int mock_domain_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = MOCK_PFN_START_IOVA; + unsigned long start_iova = iova; + + /* + * xarray does not reliably work with fault injection because it does a + * retry allocation, so put our own failure point. + */ + if (iommufd_should_fail()) + return -ENOENT; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + void *old; + + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + flags = MOCK_PFN_LAST_IOVA; + old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, + xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | + flags), + gfp); + if (xa_is_err(old)) { + for (; start_iova != iova; + start_iova += MOCK_IO_PAGE_SIZE) + xa_erase(&mock->pfns, + start_iova / + MOCK_IO_PAGE_SIZE); + return xa_err(old); + } + WARN_ON(old); + iova += MOCK_IO_PAGE_SIZE; + paddr += MOCK_IO_PAGE_SIZE; + *mapped += MOCK_IO_PAGE_SIZE; + flags = 0; + } + } + return 0; +} + +static size_t mock_domain_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + bool first = true; + size_t ret = 0; + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + /* + * iommufd generates unmaps that must be a strict + * superset of the map's performend So every starting + * IOVA should have been an iova passed to map, and the + * + * First IOVA must be present and have been a first IOVA + * passed to map_pages + */ + if (first) { + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); + first = false; + } + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); + + iova += MOCK_IO_PAGE_SIZE; + ret += MOCK_IO_PAGE_SIZE; + } + } + return ret; +} + +static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; +} + +static const struct iommu_ops mock_ops = { + .owner = THIS_MODULE, + .pgsize_bitmap = MOCK_IO_PAGE_SIZE, + .domain_alloc = mock_domain_alloc, + .default_domain_ops = + &(struct iommu_domain_ops){ + .free = mock_domain_free, + .map_pages = mock_domain_map_pages, + .unmap_pages = mock_domain_unmap_pages, + .iova_to_phys = mock_domain_iova_to_phys, + }, +}; + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, mockpt_id, + IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); + if (hwpt->domain->ops != mock_ops.default_domain_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + return hwpt; +} + +/* Create an hw_pagetable with the mock domain so we can test the domain ops */ +static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + static struct bus_type mock_bus = { .iommu_ops = &mock_ops }; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj *sobj; + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(sobj)) { + rc = PTR_ERR(sobj); + goto out_ioas; + } + sobj->idev.ictx = ucmd->ictx; + sobj->type = TYPE_IDEV; + sobj->idev.mock_dev.bus = &mock_bus; + + hwpt = iommufd_device_selftest_attach(ucmd->ictx, ioas, + &sobj->idev.mock_dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + sobj->idev.hwpt = hwpt; + + /* Userspace must destroy both of these IDs to destroy the object */ + cmd->mock_domain.out_hwpt_id = hwpt->obj.id; + cmd->mock_domain.out_device_id = sobj->obj.id; + iommufd_object_finalize(ucmd->ictx, &sobj->obj); + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_object_abort(ucmd->ictx, &sobj->obj); +out_ioas: + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Add an additional reserved IOVA to the IOAS */ +static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, + unsigned long start, size_t length) +{ + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, mockpt_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + down_write(&ioas->iopt.iova_rwsem); + rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); + up_write(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Check that every pfn under each iova matches the pfn under a user VA */ +static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, unsigned long iova, + size_t length, void __user *uptr) +{ + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc; + + if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || + (uintptr_t)uptr % MOCK_IO_PAGE_SIZE) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + for (; length; length -= MOCK_IO_PAGE_SIZE) { + struct page *pages[1]; + unsigned long pfn; + long npages; + void *ent; + + npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, + pages); + if (npages < 0) { + rc = npages; + goto out_put; + } + if (WARN_ON(npages != 1)) { + rc = -EFAULT; + goto out_put; + } + pfn = page_to_pfn(pages[0]); + put_page(pages[0]); + + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent || + (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + rc = -EINVAL; + goto out_put; + } + iova += MOCK_IO_PAGE_SIZE; + uptr += MOCK_IO_PAGE_SIZE; + } + rc = 0; + +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + +/* Check that the page ref count matches, to look for missing pin/unpins */ +static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, + void __user *uptr, size_t length, + unsigned int refs) +{ + if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE) + return -EINVAL; + + for (; length; length -= PAGE_SIZE) { + struct page *pages[1]; + long npages; + + npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages); + if (npages < 0) + return npages; + if (WARN_ON(npages != 1)) + return -EFAULT; + if (!PageCompound(pages[0])) { + unsigned int count; + + count = page_ref_count(pages[0]); + if (count / GUP_PIN_COUNTING_BIAS != refs) { + put_page(pages[0]); + return -EIO; + } + } + put_page(pages[0]); + uptr += PAGE_SIZE; + } + return 0; +} + +struct selftest_access { + struct iommufd_access *access; + struct file *file; + struct mutex lock; + struct list_head items; + unsigned int next_id; + bool destroying; +}; + +struct selftest_access_item { + struct list_head items_elm; + unsigned long iova; + size_t length; + unsigned int id; +}; + +static const struct file_operations iommfd_test_staccess_fops; + +static struct selftest_access *iommufd_access_get(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADFD); + + if (file->f_op != &iommfd_test_staccess_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + return file->private_data; +} + +static void iommufd_test_access_unmap(void *data, unsigned long iova, + unsigned long length) +{ + unsigned long iova_last = iova + length - 1; + struct selftest_access *staccess = data; + struct selftest_access_item *item; + struct selftest_access_item *tmp; + + mutex_lock(&staccess->lock); + list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) { + if (iova > item->iova + item->length - 1 || + iova_last < item->iova) + continue; + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + kfree(item); + } + mutex_unlock(&staccess->lock); +} + +static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int item_id) +{ + struct selftest_access_item *item; + struct selftest_access *staccess; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + mutex_lock(&staccess->lock); + list_for_each_entry(item, &staccess->items, items_elm) { + if (item->id == item_id) { + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + mutex_unlock(&staccess->lock); + kfree(item); + fput(staccess->file); + return 0; + } + } + mutex_unlock(&staccess->lock); + fput(staccess->file); + return -ENOENT; +} + +static int iommufd_test_staccess_release(struct inode *inode, + struct file *filep) +{ + struct selftest_access *staccess = filep->private_data; + + if (staccess->access) { + iommufd_test_access_unmap(staccess, 0, ULONG_MAX); + iommufd_access_destroy(staccess->access); + } + mutex_destroy(&staccess->lock); + kfree(staccess); + return 0; +} + +static const struct iommufd_access_ops selftest_access_ops_pin = { + .needs_pin_pages = 1, + .unmap = iommufd_test_access_unmap, +}; + +static const struct iommufd_access_ops selftest_access_ops = { + .unmap = iommufd_test_access_unmap, +}; + +static const struct file_operations iommfd_test_staccess_fops = { + .release = iommufd_test_staccess_release, +}; + +static struct selftest_access *iommufd_test_alloc_access(void) +{ + struct selftest_access *staccess; + struct file *filep; + + staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT); + if (!staccess) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&staccess->items); + mutex_init(&staccess->lock); + + filep = anon_inode_getfile("[iommufd_test_staccess]", + &iommfd_test_staccess_fops, staccess, + O_RDWR); + if (IS_ERR(filep)) { + kfree(staccess); + return ERR_CAST(filep); + } + staccess->file = filep; + return staccess; +} + +static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + struct iommufd_access *access; + int fdno; + int rc; + + if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) + return -EOPNOTSUPP; + + staccess = iommufd_test_alloc_access(); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = -ENOMEM; + goto out_free_staccess; + } + + access = iommufd_access_create( + ucmd->ictx, ioas_id, + (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ? + &selftest_access_ops_pin : + &selftest_access_ops, + staccess); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_fdno; + } + cmd->create_access.out_access_fd = fdno; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_destroy; + + staccess->access = access; + fd_install(fdno, staccess->file); + return 0; + +out_destroy: + iommufd_access_destroy(access); +out_put_fdno: + put_unused_fd(fdno); +out_free_staccess: + fput(staccess->file); + return rc; +} + +/* Check that the pages in a page array match the pages in the user VA */ +static int iommufd_test_check_pages(void __user *uptr, struct page **pages, + size_t npages) +{ + for (; npages; npages--) { + struct page *tmp_pages[1]; + long rc; + + rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages); + if (rc < 0) + return rc; + if (WARN_ON(rc != 1)) + return -EFAULT; + put_page(tmp_pages[0]); + if (tmp_pages[0] != *pages) + return -EBADE; + pages++; + uptr += PAGE_SIZE; + } + return 0; +} + +static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *uptr, + u32 flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access_item *item; + struct selftest_access *staccess; + struct page **pages; + size_t npages; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + if (staccess->access->ops != &selftest_access_ops_pin) { + rc = -EOPNOTSUPP; + goto out_put; + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_pages.iova); + + npages = (ALIGN(iova + length, PAGE_SIZE) - + ALIGN_DOWN(iova, PAGE_SIZE)) / + PAGE_SIZE; + pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) { + rc = -ENOMEM; + goto out_put; + } + + /* + * Drivers will need to think very carefully about this locking. The + * core code can do multiple unmaps instantaneously after + * iommufd_access_pin_pages() and *all* the unmaps must not return until + * the range is unpinned. This simple implementation puts a global lock + * around the pin, which may not suit drivers that want this to be a + * performance path. drivers that get this wrong will trigger WARN_ON + * races and cause EDEADLOCK failures to userspace. + */ + mutex_lock(&staccess->lock); + rc = iommufd_access_pin_pages(staccess->access, iova, length, pages, + flags & MOCK_FLAGS_ACCESS_WRITE); + if (rc) + goto out_unlock; + + /* For syzkaller allow uptr to be NULL to skip this check */ + if (uptr) { + rc = iommufd_test_check_pages( + uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages, + npages); + if (rc) + goto out_unaccess; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT); + if (!item) { + rc = -ENOMEM; + goto out_unaccess; + } + + item->iova = iova; + item->length = length; + item->id = staccess->next_id++; + list_add_tail(&item->items_elm, &staccess->items); + + cmd->access_pages.out_access_pages_id = item->id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_free_item; + goto out_unlock; + +out_free_item: + list_del(&item->items_elm); + kfree(item); +out_unaccess: + iommufd_access_unpin_pages(staccess->access, iova, length); +out_unlock: + mutex_unlock(&staccess->lock); + kvfree(pages); +out_put: + fput(staccess->file); + return rc; +} + +static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *ubuf, + unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + void *tmp; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | + MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (flags & MOCK_ACCESS_RW_WRITE) { + if (copy_from_user(tmp, ubuf, length)) { + rc = -EFAULT; + goto out_free; + } + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_rw.iova); + + rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); + if (rc) + goto out_free; + if (!(flags & MOCK_ACCESS_RW_WRITE)) { + if (copy_to_user(ubuf, tmp, length)) { + rc = -EFAULT; + goto out_free; + } + } + +out_free: + kvfree(tmp); +out_put: + fput(staccess->file); + return rc; +} +static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); +static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == + __IOMMUFD_ACCESS_RW_SLOW_PATH); + +void iommufd_selftest_destroy(struct iommufd_object *obj) +{ + struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + + switch (sobj->type) { + case TYPE_IDEV: + iommufd_device_selftest_detach(sobj->idev.ictx, + sobj->idev.hwpt); + break; + } +} + +int iommufd_test(struct iommufd_ucmd *ucmd) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + + switch (cmd->op) { + case IOMMU_TEST_OP_ADD_RESERVED: + return iommufd_test_add_reserved(ucmd, cmd->id, + cmd->add_reserved.start, + cmd->add_reserved.length); + case IOMMU_TEST_OP_MOCK_DOMAIN: + return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MD_CHECK_MAP: + return iommufd_test_md_check_pa( + ucmd, cmd->id, cmd->check_map.iova, + cmd->check_map.length, + u64_to_user_ptr(cmd->check_map.uptr)); + case IOMMU_TEST_OP_MD_CHECK_REFS: + return iommufd_test_md_check_refs( + ucmd, u64_to_user_ptr(cmd->check_refs.uptr), + cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_CREATE_ACCESS: + return iommufd_test_create_access(ucmd, cmd->id, + cmd->create_access.flags); + case IOMMU_TEST_OP_ACCESS_PAGES: + return iommufd_test_access_pages( + ucmd, cmd->id, cmd->access_pages.iova, + cmd->access_pages.length, + u64_to_user_ptr(cmd->access_pages.uptr), + cmd->access_pages.flags); + case IOMMU_TEST_OP_ACCESS_RW: + return iommufd_test_access_rw( + ucmd, cmd->id, cmd->access_rw.iova, + cmd->access_rw.length, + u64_to_user_ptr(cmd->access_rw.uptr), + cmd->access_rw.flags); + case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES: + return iommufd_test_access_item_destroy( + ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id); + case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT: + /* Protect _batch_init(), can not be less than elmsz */ + if (cmd->memory_limit.limit < + sizeof(unsigned long) + sizeof(u32)) + return -EINVAL; + iommufd_test_memory_limit = cmd->memory_limit.limit; + return 0; + default: + return -EOPNOTSUPP; + } +} + +bool iommufd_should_fail(void) +{ + return should_fail(&fail_iommufd, 1); +} + +void __init iommufd_test_init(void) +{ + dbgfs_root = + fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); +} + +void iommufd_test_exit(void) +{ + debugfs_remove_recursive(dbgfs_root); +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 84af9a239769a..650d45629647a 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,9 @@ enum { IOMMUFD_ACCESS_RW_WRITE = 1 << 0, /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, + + /* Only for use by selftest */ + __IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2, }; struct iommufd_access * -- GitLab From e26eed4f623da70913b535631a29764d108efe98 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:40 -0400 Subject: [PATCH 1016/1423] iommufd: Add some fault injection points This increases the coverage the fail_nth test gets, as well as via syzkaller. Link: https://lore.kernel.org/r/17-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Matthew Rosato # s390 Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/main.c | 3 +++ drivers/iommu/iommufd/pages.c | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 7c8f40bc8d98d..bcb463e581009 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -102,6 +102,9 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, { struct iommufd_object *obj; + if (iommufd_should_fail()) + return ERR_PTR(-ENOENT); + xa_lock(&ictx->objects); obj = xa_load(&ictx->objects, id); if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 640331b8a0791..c5d2d9a8c5620 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -80,6 +80,10 @@ static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) if (*size < backup_len) return backup; + + if (!backup && iommufd_should_fail()) + return NULL; + *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (res) @@ -544,6 +548,7 @@ static int pages_to_xarray(struct xarray *xa, unsigned long start_index, unsigned long last_index, struct page **pages) { struct page **end_pages = pages + (last_index - start_index) + 1; + struct page **half_pages = pages + (end_pages - pages) / 2; XA_STATE(xas, xa, start_index); do { @@ -551,6 +556,15 @@ static int pages_to_xarray(struct xarray *xa, unsigned long start_index, xas_lock(&xas); while (pages != end_pages) { + /* xarray does not participate in fault injection */ + if (pages == half_pages && iommufd_should_fail()) { + xas_set_err(&xas, -EINVAL); + xas_unlock(&xas); + /* aka xas_destroy() */ + xas_nomem(&xas, GFP_KERNEL); + goto err_clear; + } + old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); if (xas_error(&xas)) break; @@ -561,6 +575,7 @@ static int pages_to_xarray(struct xarray *xa, unsigned long start_index, xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); +err_clear: if (xas_error(&xas)) { if (xas.xa_index != start_index) clear_xarray(xa, start_index, xas.xa_index - 1); @@ -728,6 +743,10 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, npages = min_t(unsigned long, last_index - start_index + 1, user->upages_len / sizeof(*user->upages)); + + if (iommufd_should_fail()) + return -EFAULT; + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!remote_mm) rc = pin_user_pages_fast(uptr, npages, user->gup_flags, @@ -872,6 +891,8 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, npages = pages->last_npinned - pages->npinned; inc = false; } else { + if (iommufd_should_fail()) + return -ENOMEM; npages = pages->npinned - pages->last_npinned; inc = true; } @@ -1721,6 +1742,11 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); + if (iommufd_should_fail()) { + rc = -EINVAL; + goto out_mmput; + } + mmap_read_lock(pages->source_mm); rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), -- GitLab From 52f528583bb395495f7dd35e6e4d548bccbf8a73 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:41 -0400 Subject: [PATCH 1017/1423] iommufd: Add additional invariant assertions These are on performance paths so we protect them using the CONFIG_IOMMUFD_TEST to not take a hit during normal operation. These are useful when running the test suite and syzkaller to find data structure inconsistencies early. Link: https://lore.kernel.org/r/18-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Yi Liu Tested-by: Matthew Rosato # s390 Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 5 ++++ drivers/iommu/iommufd/io_pagetable.c | 22 +++++++++++++++ drivers/iommu/iommufd/io_pagetable.h | 3 ++ drivers/iommu/iommufd/pages.c | 42 ++++++++++++++++++++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 67ce36152e8ab..dd2a415b603e3 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -625,6 +625,11 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, struct iopt_area *area; int rc; + /* Driver's ops don't support pin_pages */ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + return -EINVAL; + if (!length) return -EINVAL; if (check_add_overflow(iova, length - 1, &last_iova)) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 4f4a9d9aac570..3467cea795684 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -251,6 +251,11 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, (uintptr_t)elm->pages->uptr + elm->start_byte, length); if (rc) goto out_unlock; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { + rc = -EINVAL; + goto out_unlock; + } } else { rc = iopt_check_iova(iopt, *dst_iova, length); if (rc) @@ -277,6 +282,8 @@ out_unlock: static void iopt_abort_area(struct iopt_area *area) { + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(area->pages); if (area->iopt) { down_write(&area->iopt->iova_rwsem); interval_tree_remove(&area->node, &area->iopt->area_itree); @@ -642,6 +649,9 @@ void iopt_destroy_table(struct io_pagetable *iopt) { struct interval_tree_node *node; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + iopt_remove_reserved_iova(iopt, NULL); + while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, ULONG_MAX))) { interval_tree_remove(node, &iopt->allowed_itree); @@ -688,6 +698,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, continue; mutex_lock(&pages->mutex); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; mutex_unlock(&pages->mutex); @@ -792,6 +804,16 @@ static int iopt_check_iova_alignment(struct io_pagetable *iopt, (iopt_area_length(area) & align_mask) || (area->page_offset & align_mask)) return -EADDRINUSE; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { + struct iommufd_access *access; + unsigned long index; + + xa_for_each(&iopt->access_list, index, access) + if (WARN_ON(access->iova_alignment > + new_iova_alignment)) + return -EADDRINUSE; + } return 0; } diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 2ee6942c3ef4a..83e7c175f2a27 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -101,6 +101,9 @@ static inline size_t iopt_area_length(struct iopt_area *area) static inline unsigned long iopt_area_start_byte(struct iopt_area *area, unsigned long iova) { + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(iova < iopt_area_iova(area) || + iova > iopt_area_last_iova(area)); return (iova - iopt_area_iova(area)) + area->page_offset + iopt_area_index(area) * PAGE_SIZE; } diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index c5d2d9a8c5620..429fa3b0a239c 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -162,12 +162,20 @@ void interval_tree_double_span_iter_next( static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) { - pages->npinned += npages; + int rc; + + rc = check_add_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); } static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) { - pages->npinned -= npages; + int rc; + + rc = check_sub_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); } static void iopt_pages_err_unpin(struct iopt_pages *pages, @@ -189,6 +197,9 @@ static void iopt_pages_err_unpin(struct iopt_pages *pages, static unsigned long iopt_area_index_to_iova(struct iopt_area *area, unsigned long index) { + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); index -= iopt_area_index(area); if (index == 0) return iopt_area_iova(area); @@ -198,6 +209,9 @@ static unsigned long iopt_area_index_to_iova(struct iopt_area *area, static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, unsigned long index) { + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); if (index == iopt_area_last_index(area)) return iopt_area_last_iova(area); return iopt_area_iova(area) - area->page_offset + @@ -286,6 +300,8 @@ static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) { if (!batch->total_pfns) return; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(batch->total_pfns != batch->npfns[0]); skip_pfns = min(batch->total_pfns, skip_pfns); batch->pfns[0] += skip_pfns; batch->npfns[0] -= skip_pfns; @@ -301,6 +317,8 @@ static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, batch->pfns = temp_kmalloc(&size, backup, backup_len); if (!batch->pfns) return -ENOMEM; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz)) + return -EINVAL; batch->array_size = size / elmsz; batch->npfns = (u32 *)(batch->pfns + batch->array_size); batch_clear(batch); @@ -429,6 +447,10 @@ static int batch_iommu_map_small(struct iommu_domain *domain, unsigned long start_iova = iova; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE || + size % PAGE_SIZE); + while (size) { rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot); if (rc) @@ -718,6 +740,10 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, uintptr_t uptr; long rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + if (!user->upages) { /* All undone in pfn_reader_destroy() */ user->upages_len = @@ -956,6 +982,10 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) struct iopt_area *area; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(span->last_used < start_index)) + return -EINVAL; + if (span->is_used == 1) { batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, start_index, span->last_used); @@ -1008,6 +1038,10 @@ static int pfn_reader_next(struct pfn_reader *pfns) while (pfns->batch_end_index != pfns->last_index + 1) { unsigned int npfns = pfns->batch.total_pfns; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(interval_tree_double_span_iter_done(&pfns->span))) + return -EINVAL; + rc = pfn_reader_fill_span(pfns); if (rc) return rc; @@ -1091,6 +1125,10 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, { int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + rc = pfn_reader_init(pfns, pages, start_index, last_index); if (rc) return rc; -- GitLab From 57f0988706fec1b8dbc3fe00965828a47e2235a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:42 -0400 Subject: [PATCH 1018/1423] iommufd: Add a selftest Cover the essential functionality of the iommufd with a directed test from userspace. This aims to achieve reasonable functional coverage using the in-kernel self test framework. A second test does a failure injection sweep of the success paths to study error unwind behaviors. This allows achieving high coverage of the corner cases in pages.c. The selftest requires CONFIG_IOMMUFD_TEST to be enabled, and several huge pages which may require: echo 4 > /proc/sys/vm/nr_hugepages Link: https://lore.kernel.org/r/19-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Matthew Rosato # s390 Tested-by: Yi Liu Tested-by: Eric Auger # aarch64 Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/iommu/.gitignore | 3 + tools/testing/selftests/iommu/Makefile | 12 + tools/testing/selftests/iommu/config | 2 + tools/testing/selftests/iommu/iommufd.c | 1654 +++++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 580 ++++++ tools/testing/selftests/iommu/iommufd_utils.h | 278 +++ 7 files changed, 2530 insertions(+) create mode 100644 tools/testing/selftests/iommu/.gitignore create mode 100644 tools/testing/selftests/iommu/Makefile create mode 100644 tools/testing/selftests/iommu/config create mode 100644 tools/testing/selftests/iommu/iommufd.c create mode 100644 tools/testing/selftests/iommu/iommufd_fail_nth.c create mode 100644 tools/testing/selftests/iommu/iommufd_utils.h diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f07aef7c592c2..d6680af7b2956 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -27,6 +27,7 @@ TARGETS += ftrace TARGETS += futex TARGETS += gpio TARGETS += intel_pstate +TARGETS += iommu TARGETS += ipc TARGETS += ir TARGETS += kcmp diff --git a/tools/testing/selftests/iommu/.gitignore b/tools/testing/selftests/iommu/.gitignore new file mode 100644 index 0000000000000..7d0703049ebaf --- /dev/null +++ b/tools/testing/selftests/iommu/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +/iommufd +/iommufd_fail_nth diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile new file mode 100644 index 0000000000000..7cb74d26f1417 --- /dev/null +++ b/tools/testing/selftests/iommu/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O2 -Wno-unused-function +CFLAGS += -I../../../../include/uapi/ +CFLAGS += -I../../../../include/ + +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS := +TEST_GEN_PROGS += iommufd +TEST_GEN_PROGS += iommufd_fail_nth + +include ../lib.mk diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config new file mode 100644 index 0000000000000..6c4f901d6fed3 --- /dev/null +++ b/tools/testing/selftests/iommu/config @@ -0,0 +1,2 @@ +CONFIG_IOMMUFD +CONFIG_IOMMUFD_TEST diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c new file mode 100644 index 0000000000000..8aa8a346cf221 --- /dev/null +++ b/tools/testing/selftests/iommu/iommufd.c @@ -0,0 +1,1654 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include +#include +#include + +#define __EXPORTED_HEADERS__ +#include + +#include "iommufd_utils.h" + +static void *buffer; + +static unsigned long PAGE_SIZE; +static unsigned long HUGEPAGE_SIZE; + +#define MOCK_PAGE_SIZE (PAGE_SIZE / 2) + +static unsigned long get_huge_page_size(void) +{ + char buf[80]; + int ret; + int fd; + + fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", + O_RDONLY); + if (fd < 0) + return 2 * 1024 * 1024; + + ret = read(fd, buf, sizeof(buf)); + close(fd); + if (ret <= 0 || ret == sizeof(buf)) + return 2 * 1024 * 1024; + buf[ret] = 0; + return strtoul(buf, NULL, 10); +} + +static __attribute__((constructor)) void setup_sizes(void) +{ + void *vrc; + int rc; + + PAGE_SIZE = sysconf(_SC_PAGE_SIZE); + HUGEPAGE_SIZE = get_huge_page_size(); + + BUFFER_SIZE = PAGE_SIZE * 16; + rc = posix_memalign(&buffer, HUGEPAGE_SIZE, BUFFER_SIZE); + assert(!rc); + assert(buffer); + assert((uintptr_t)buffer % HUGEPAGE_SIZE == 0); + vrc = mmap(buffer, BUFFER_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + assert(vrc == buffer); +} + +FIXTURE(iommufd) +{ + int fd; +}; + +FIXTURE_SETUP(iommufd) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); +} + +FIXTURE_TEARDOWN(iommufd) +{ + teardown_iommufd(self->fd, _metadata); +} + +TEST_F(iommufd, simple_close) +{ +} + +TEST_F(iommufd, cmd_fail) +{ + struct iommu_destroy cmd = { .size = sizeof(cmd), .id = 0 }; + + /* object id is invalid */ + EXPECT_ERRNO(ENOENT, _test_ioctl_destroy(self->fd, 0)); + /* Bad pointer */ + EXPECT_ERRNO(EFAULT, ioctl(self->fd, IOMMU_DESTROY, NULL)); + /* Unknown ioctl */ + EXPECT_ERRNO(ENOTTY, + ioctl(self->fd, _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE - 1), + &cmd)); +} + +TEST_F(iommufd, cmd_length) +{ +#define TEST_LENGTH(_struct, _ioctl) \ + { \ + struct { \ + struct _struct cmd; \ + uint8_t extra; \ + } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \ + .extra = UINT8_MAX }; \ + int old_errno; \ + int rc; \ + \ + EXPECT_ERRNO(EINVAL, ioctl(self->fd, _ioctl, &cmd)); \ + cmd.cmd.size = sizeof(struct _struct) + 1; \ + EXPECT_ERRNO(E2BIG, ioctl(self->fd, _ioctl, &cmd)); \ + cmd.cmd.size = sizeof(struct _struct); \ + rc = ioctl(self->fd, _ioctl, &cmd); \ + old_errno = errno; \ + cmd.cmd.size = sizeof(struct _struct) + 1; \ + cmd.extra = 0; \ + if (rc) { \ + EXPECT_ERRNO(old_errno, \ + ioctl(self->fd, _ioctl, &cmd)); \ + } else { \ + ASSERT_EQ(0, ioctl(self->fd, _ioctl, &cmd)); \ + } \ + } + + TEST_LENGTH(iommu_destroy, IOMMU_DESTROY); + TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC); + TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES); + TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS); + TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP); + TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY); + TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP); + TEST_LENGTH(iommu_option, IOMMU_OPTION); + TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS); +#undef TEST_LENGTH +} + +TEST_F(iommufd, cmd_ex_fail) +{ + struct { + struct iommu_destroy cmd; + __u64 future; + } cmd = { .cmd = { .size = sizeof(cmd), .id = 0 } }; + + /* object id is invalid and command is longer */ + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_DESTROY, &cmd)); + /* future area is non-zero */ + cmd.future = 1; + EXPECT_ERRNO(E2BIG, ioctl(self->fd, IOMMU_DESTROY, &cmd)); + /* Original command "works" */ + cmd.cmd.size = sizeof(cmd.cmd); + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_DESTROY, &cmd)); + /* Short command fails */ + cmd.cmd.size = sizeof(cmd.cmd) - 1; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_DESTROY, &cmd)); +} + +TEST_F(iommufd, global_options) +{ + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_RLIMIT_MODE, + .op = IOMMU_OPTION_OP_GET, + .val64 = 1, + }; + + cmd.option_id = IOMMU_OPTION_RLIMIT_MODE; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + ASSERT_EQ(0, cmd.val64); + + /* This requires root */ + cmd.op = IOMMU_OPTION_OP_SET; + cmd.val64 = 1; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + cmd.val64 = 2; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + cmd.op = IOMMU_OPTION_OP_GET; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + ASSERT_EQ(1, cmd.val64); + + cmd.op = IOMMU_OPTION_OP_SET; + cmd.val64 = 0; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + cmd.op = IOMMU_OPTION_OP_GET; + cmd.option_id = IOMMU_OPTION_HUGE_PAGES; + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd)); + cmd.op = IOMMU_OPTION_OP_SET; + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_OPTION, &cmd)); +} + +FIXTURE(iommufd_ioas) +{ + int fd; + uint32_t ioas_id; + uint32_t domain_id; + uint64_t base_iova; +}; + +FIXTURE_VARIANT(iommufd_ioas) +{ + unsigned int mock_domains; + unsigned int memory_limit; +}; + +FIXTURE_SETUP(iommufd_ioas) +{ + unsigned int i; + + + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + test_ioctl_ioas_alloc(&self->ioas_id); + + if (!variant->memory_limit) { + test_ioctl_set_default_memory_limit(); + } else { + test_ioctl_set_temp_memory_limit(variant->memory_limit); + } + + for (i = 0; i != variant->mock_domains; i++) { + test_cmd_mock_domain(self->ioas_id, NULL, &self->domain_id); + self->base_iova = MOCK_APERTURE_START; + } +} + +FIXTURE_TEARDOWN(iommufd_ioas) +{ + test_ioctl_set_default_memory_limit(); + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_ioas, no_domain) +{ +}; + +FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain) +{ + .mock_domains = 1, +}; + +FIXTURE_VARIANT_ADD(iommufd_ioas, two_mock_domain) +{ + .mock_domains = 2, +}; + +FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain_limit) +{ + .mock_domains = 1, + .memory_limit = 16, +}; + +TEST_F(iommufd_ioas, ioas_auto_destroy) +{ +} + +TEST_F(iommufd_ioas, ioas_destroy) +{ + if (self->domain_id) { + /* IOAS cannot be freed while a domain is on it */ + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, self->ioas_id)); + } else { + /* Can allocate and manually free an IOAS table */ + test_ioctl_destroy(self->ioas_id); + } +} + +TEST_F(iommufd_ioas, ioas_area_destroy) +{ + /* Adding an area does not change ability to destroy */ + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, self->base_iova); + if (self->domain_id) + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, self->ioas_id)); + else + test_ioctl_destroy(self->ioas_id); +} + +TEST_F(iommufd_ioas, ioas_area_auto_destroy) +{ + int i; + + /* Can allocate and automatically free an IOAS table with many areas */ + for (i = 0; i != 10; i++) { + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, + self->base_iova + i * PAGE_SIZE); + } +} + +TEST_F(iommufd_ioas, area) +{ + int i; + + /* Unmap fails if nothing is mapped */ + for (i = 0; i != 10; i++) + test_err_ioctl_ioas_unmap(ENOENT, i * PAGE_SIZE, PAGE_SIZE); + + /* Unmap works */ + for (i = 0; i != 10; i++) + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, + self->base_iova + i * PAGE_SIZE); + for (i = 0; i != 10; i++) + test_ioctl_ioas_unmap(self->base_iova + i * PAGE_SIZE, + PAGE_SIZE); + + /* Split fails */ + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE * 2, + self->base_iova + 16 * PAGE_SIZE); + test_err_ioctl_ioas_unmap(ENOENT, self->base_iova + 16 * PAGE_SIZE, + PAGE_SIZE); + test_err_ioctl_ioas_unmap(ENOENT, self->base_iova + 17 * PAGE_SIZE, + PAGE_SIZE); + + /* Over map fails */ + test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 2, + self->base_iova + 16 * PAGE_SIZE); + test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE, + self->base_iova + 16 * PAGE_SIZE); + test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE, + self->base_iova + 17 * PAGE_SIZE); + test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 2, + self->base_iova + 15 * PAGE_SIZE); + test_err_ioctl_ioas_map_fixed(EEXIST, buffer, PAGE_SIZE * 3, + self->base_iova + 15 * PAGE_SIZE); + + /* unmap all works */ + test_ioctl_ioas_unmap(0, UINT64_MAX); + + /* Unmap all succeeds on an empty IOAS */ + test_ioctl_ioas_unmap(0, UINT64_MAX); +} + +TEST_F(iommufd_ioas, unmap_fully_contained_areas) +{ + uint64_t unmap_len; + int i; + + /* Give no_domain some space to rewind base_iova */ + self->base_iova += 4 * PAGE_SIZE; + + for (i = 0; i != 4; i++) + test_ioctl_ioas_map_fixed(buffer, 8 * PAGE_SIZE, + self->base_iova + i * 16 * PAGE_SIZE); + + /* Unmap not fully contained area doesn't work */ + test_err_ioctl_ioas_unmap(ENOENT, self->base_iova - 4 * PAGE_SIZE, + 8 * PAGE_SIZE); + test_err_ioctl_ioas_unmap(ENOENT, + self->base_iova + 3 * 16 * PAGE_SIZE + + 8 * PAGE_SIZE - 4 * PAGE_SIZE, + 8 * PAGE_SIZE); + + /* Unmap fully contained areas works */ + ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, + self->base_iova - 4 * PAGE_SIZE, + 3 * 16 * PAGE_SIZE + 8 * PAGE_SIZE + + 4 * PAGE_SIZE, + &unmap_len)); + ASSERT_EQ(32 * PAGE_SIZE, unmap_len); +} + +TEST_F(iommufd_ioas, area_auto_iova) +{ + struct iommu_test_cmd test_cmd = { + .size = sizeof(test_cmd), + .op = IOMMU_TEST_OP_ADD_RESERVED, + .id = self->ioas_id, + .add_reserved = { .start = PAGE_SIZE * 4, + .length = PAGE_SIZE * 100 }, + }; + struct iommu_iova_range ranges[1] = {}; + struct iommu_ioas_allow_iovas allow_cmd = { + .size = sizeof(allow_cmd), + .ioas_id = self->ioas_id, + .num_iovas = 1, + .allowed_iovas = (uintptr_t)ranges, + }; + __u64 iovas[10]; + int i; + + /* Simple 4k pages */ + for (i = 0; i != 10; i++) + test_ioctl_ioas_map(buffer, PAGE_SIZE, &iovas[i]); + for (i = 0; i != 10; i++) + test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE); + + /* Kernel automatically aligns IOVAs properly */ + for (i = 0; i != 10; i++) { + size_t length = PAGE_SIZE * (i + 1); + + if (self->domain_id) { + test_ioctl_ioas_map(buffer, length, &iovas[i]); + } else { + test_ioctl_ioas_map((void *)(1UL << 31), length, + &iovas[i]); + } + EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1))); + } + for (i = 0; i != 10; i++) + test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1)); + + /* Avoids a reserved region */ + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED), + &test_cmd)); + for (i = 0; i != 10; i++) { + size_t length = PAGE_SIZE * (i + 1); + + test_ioctl_ioas_map(buffer, length, &iovas[i]); + EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1))); + EXPECT_EQ(false, + iovas[i] > test_cmd.add_reserved.start && + iovas[i] < + test_cmd.add_reserved.start + + test_cmd.add_reserved.length); + } + for (i = 0; i != 10; i++) + test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1)); + + /* Allowed region intersects with a reserved region */ + ranges[0].start = PAGE_SIZE; + ranges[0].last = PAGE_SIZE * 600; + EXPECT_ERRNO(EADDRINUSE, + ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)); + + /* Allocate from an allowed region */ + if (self->domain_id) { + ranges[0].start = MOCK_APERTURE_START + PAGE_SIZE; + ranges[0].last = MOCK_APERTURE_START + PAGE_SIZE * 600 - 1; + } else { + ranges[0].start = PAGE_SIZE * 200; + ranges[0].last = PAGE_SIZE * 600 - 1; + } + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)); + for (i = 0; i != 10; i++) { + size_t length = PAGE_SIZE * (i + 1); + + test_ioctl_ioas_map(buffer, length, &iovas[i]); + EXPECT_EQ(0, iovas[i] % (1UL << (ffs(length) - 1))); + EXPECT_EQ(true, iovas[i] >= ranges[0].start); + EXPECT_EQ(true, iovas[i] <= ranges[0].last); + EXPECT_EQ(true, iovas[i] + length > ranges[0].start); + EXPECT_EQ(true, iovas[i] + length <= ranges[0].last + 1); + } + for (i = 0; i != 10; i++) + test_ioctl_ioas_unmap(iovas[i], PAGE_SIZE * (i + 1)); +} + +TEST_F(iommufd_ioas, area_allowed) +{ + struct iommu_test_cmd test_cmd = { + .size = sizeof(test_cmd), + .op = IOMMU_TEST_OP_ADD_RESERVED, + .id = self->ioas_id, + .add_reserved = { .start = PAGE_SIZE * 4, + .length = PAGE_SIZE * 100 }, + }; + struct iommu_iova_range ranges[1] = {}; + struct iommu_ioas_allow_iovas allow_cmd = { + .size = sizeof(allow_cmd), + .ioas_id = self->ioas_id, + .num_iovas = 1, + .allowed_iovas = (uintptr_t)ranges, + }; + + /* Reserved intersects an allowed */ + allow_cmd.num_iovas = 1; + ranges[0].start = self->base_iova; + ranges[0].last = ranges[0].start + PAGE_SIZE * 600; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)); + test_cmd.add_reserved.start = ranges[0].start + PAGE_SIZE; + test_cmd.add_reserved.length = PAGE_SIZE; + EXPECT_ERRNO(EADDRINUSE, + ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED), + &test_cmd)); + allow_cmd.num_iovas = 0; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)); + + /* Allowed intersects a reserved */ + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED), + &test_cmd)); + allow_cmd.num_iovas = 1; + ranges[0].start = self->base_iova; + ranges[0].last = ranges[0].start + PAGE_SIZE * 600; + EXPECT_ERRNO(EADDRINUSE, + ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)); +} + +TEST_F(iommufd_ioas, copy_area) +{ + struct iommu_ioas_copy copy_cmd = { + .size = sizeof(copy_cmd), + .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .dst_ioas_id = self->ioas_id, + .src_ioas_id = self->ioas_id, + .length = PAGE_SIZE, + }; + + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, self->base_iova); + + /* Copy inside a single IOAS */ + copy_cmd.src_iova = self->base_iova; + copy_cmd.dst_iova = self->base_iova + PAGE_SIZE; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); + + /* Copy between IOAS's */ + copy_cmd.src_iova = self->base_iova; + copy_cmd.dst_iova = 0; + test_ioctl_ioas_alloc(©_cmd.dst_ioas_id); + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); +} + +TEST_F(iommufd_ioas, iova_ranges) +{ + struct iommu_test_cmd test_cmd = { + .size = sizeof(test_cmd), + .op = IOMMU_TEST_OP_ADD_RESERVED, + .id = self->ioas_id, + .add_reserved = { .start = PAGE_SIZE, .length = PAGE_SIZE }, + }; + struct iommu_iova_range *ranges = buffer; + struct iommu_ioas_iova_ranges ranges_cmd = { + .size = sizeof(ranges_cmd), + .ioas_id = self->ioas_id, + .num_iovas = BUFFER_SIZE / sizeof(*ranges), + .allowed_iovas = (uintptr_t)ranges, + }; + + /* Range can be read */ + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd)); + EXPECT_EQ(1, ranges_cmd.num_iovas); + if (!self->domain_id) { + EXPECT_EQ(0, ranges[0].start); + EXPECT_EQ(SIZE_MAX, ranges[0].last); + EXPECT_EQ(1, ranges_cmd.out_iova_alignment); + } else { + EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start); + EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last); + EXPECT_EQ(MOCK_PAGE_SIZE, ranges_cmd.out_iova_alignment); + } + + /* Buffer too small */ + memset(ranges, 0, BUFFER_SIZE); + ranges_cmd.num_iovas = 0; + EXPECT_ERRNO(EMSGSIZE, + ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd)); + EXPECT_EQ(1, ranges_cmd.num_iovas); + EXPECT_EQ(0, ranges[0].start); + EXPECT_EQ(0, ranges[0].last); + + /* 2 ranges */ + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ADD_RESERVED), + &test_cmd)); + ranges_cmd.num_iovas = BUFFER_SIZE / sizeof(*ranges); + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd)); + if (!self->domain_id) { + EXPECT_EQ(2, ranges_cmd.num_iovas); + EXPECT_EQ(0, ranges[0].start); + EXPECT_EQ(PAGE_SIZE - 1, ranges[0].last); + EXPECT_EQ(PAGE_SIZE * 2, ranges[1].start); + EXPECT_EQ(SIZE_MAX, ranges[1].last); + } else { + EXPECT_EQ(1, ranges_cmd.num_iovas); + EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start); + EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last); + } + + /* Buffer too small */ + memset(ranges, 0, BUFFER_SIZE); + ranges_cmd.num_iovas = 1; + if (!self->domain_id) { + EXPECT_ERRNO(EMSGSIZE, ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, + &ranges_cmd)); + EXPECT_EQ(2, ranges_cmd.num_iovas); + EXPECT_EQ(0, ranges[0].start); + EXPECT_EQ(PAGE_SIZE - 1, ranges[0].last); + } else { + ASSERT_EQ(0, + ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd)); + EXPECT_EQ(1, ranges_cmd.num_iovas); + EXPECT_EQ(MOCK_APERTURE_START, ranges[0].start); + EXPECT_EQ(MOCK_APERTURE_LAST, ranges[0].last); + } + EXPECT_EQ(0, ranges[1].start); + EXPECT_EQ(0, ranges[1].last); +} + +TEST_F(iommufd_ioas, access_pin) +{ + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_PAGES, + .access_pages = { .iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + struct iommu_test_cmd check_map_cmd = { + .size = sizeof(check_map_cmd), + .op = IOMMU_TEST_OP_MD_CHECK_MAP, + .check_map = { .iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + uint32_t access_pages_id; + unsigned int npages; + + test_cmd_create_access(self->ioas_id, &access_cmd.id, + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES); + + for (npages = 1; npages < BUFFER_SIZE / PAGE_SIZE; npages++) { + uint32_t mock_device_id; + uint32_t mock_hwpt_id; + + access_cmd.access_pages.length = npages * PAGE_SIZE; + + /* Single map/unmap */ + test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, + MOCK_APERTURE_START); + ASSERT_EQ(0, ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + test_cmd_destroy_access_pages( + access_cmd.id, + access_cmd.access_pages.out_access_pages_id); + + /* Double user */ + ASSERT_EQ(0, ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + access_pages_id = access_cmd.access_pages.out_access_pages_id; + ASSERT_EQ(0, ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + test_cmd_destroy_access_pages( + access_cmd.id, + access_cmd.access_pages.out_access_pages_id); + test_cmd_destroy_access_pages(access_cmd.id, access_pages_id); + + /* Add/remove a domain with a user */ + ASSERT_EQ(0, ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + test_cmd_mock_domain(self->ioas_id, &mock_device_id, + &mock_hwpt_id); + check_map_cmd.id = mock_hwpt_id; + ASSERT_EQ(0, ioctl(self->fd, + _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP), + &check_map_cmd)); + + test_ioctl_destroy(mock_device_id); + test_ioctl_destroy(mock_hwpt_id); + test_cmd_destroy_access_pages( + access_cmd.id, + access_cmd.access_pages.out_access_pages_id); + + test_ioctl_ioas_unmap(MOCK_APERTURE_START, BUFFER_SIZE); + } + test_cmd_destroy_access(access_cmd.id); +} + +TEST_F(iommufd_ioas, access_pin_unmap) +{ + struct iommu_test_cmd access_pages_cmd = { + .size = sizeof(access_pages_cmd), + .op = IOMMU_TEST_OP_ACCESS_PAGES, + .access_pages = { .iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + + test_cmd_create_access(self->ioas_id, &access_pages_cmd.id, + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES); + test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, MOCK_APERTURE_START); + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_pages_cmd)); + + /* Trigger the unmap op */ + test_ioctl_ioas_unmap(MOCK_APERTURE_START, BUFFER_SIZE); + + /* kernel removed the item for us */ + test_err_destroy_access_pages( + ENOENT, access_pages_cmd.id, + access_pages_cmd.access_pages.out_access_pages_id); +} + +static void check_access_rw(struct __test_metadata *_metadata, int fd, + unsigned int access_id, uint64_t iova, + unsigned int def_flags) +{ + uint16_t tmp[32]; + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_RW, + .id = access_id, + .access_rw = { .uptr = (uintptr_t)tmp }, + }; + uint16_t *buffer16 = buffer; + unsigned int i; + void *tmp2; + + for (i = 0; i != BUFFER_SIZE / sizeof(*buffer16); i++) + buffer16[i] = rand(); + + for (access_cmd.access_rw.iova = iova + PAGE_SIZE - 50; + access_cmd.access_rw.iova < iova + PAGE_SIZE + 50; + access_cmd.access_rw.iova++) { + for (access_cmd.access_rw.length = 1; + access_cmd.access_rw.length < sizeof(tmp); + access_cmd.access_rw.length++) { + access_cmd.access_rw.flags = def_flags; + ASSERT_EQ(0, ioctl(fd, + _IOMMU_TEST_CMD( + IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)); + ASSERT_EQ(0, + memcmp(buffer + (access_cmd.access_rw.iova - + iova), + tmp, access_cmd.access_rw.length)); + + for (i = 0; i != ARRAY_SIZE(tmp); i++) + tmp[i] = rand(); + access_cmd.access_rw.flags = def_flags | + MOCK_ACCESS_RW_WRITE; + ASSERT_EQ(0, ioctl(fd, + _IOMMU_TEST_CMD( + IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)); + ASSERT_EQ(0, + memcmp(buffer + (access_cmd.access_rw.iova - + iova), + tmp, access_cmd.access_rw.length)); + } + } + + /* Multi-page test */ + tmp2 = malloc(BUFFER_SIZE); + ASSERT_NE(NULL, tmp2); + access_cmd.access_rw.iova = iova; + access_cmd.access_rw.length = BUFFER_SIZE; + access_cmd.access_rw.flags = def_flags; + access_cmd.access_rw.uptr = (uintptr_t)tmp2; + ASSERT_EQ(0, ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)); + ASSERT_EQ(0, memcmp(buffer, tmp2, access_cmd.access_rw.length)); + free(tmp2); +} + +TEST_F(iommufd_ioas, access_rw) +{ + __u32 access_id; + __u64 iova; + + test_cmd_create_access(self->ioas_id, &access_id, 0); + test_ioctl_ioas_map(buffer, BUFFER_SIZE, &iova); + check_access_rw(_metadata, self->fd, access_id, iova, 0); + check_access_rw(_metadata, self->fd, access_id, iova, + MOCK_ACCESS_RW_SLOW_PATH); + test_ioctl_ioas_unmap(iova, BUFFER_SIZE); + test_cmd_destroy_access(access_id); +} + +TEST_F(iommufd_ioas, access_rw_unaligned) +{ + __u32 access_id; + __u64 iova; + + test_cmd_create_access(self->ioas_id, &access_id, 0); + + /* Unaligned pages */ + iova = self->base_iova + MOCK_PAGE_SIZE; + test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, iova); + check_access_rw(_metadata, self->fd, access_id, iova, 0); + test_ioctl_ioas_unmap(iova, BUFFER_SIZE); + test_cmd_destroy_access(access_id); +} + +TEST_F(iommufd_ioas, fork_gone) +{ + __u32 access_id; + pid_t child; + + test_cmd_create_access(self->ioas_id, &access_id, 0); + + /* Create a mapping with a different mm */ + child = fork(); + if (!child) { + test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, + MOCK_APERTURE_START); + exit(0); + } + ASSERT_NE(-1, child); + ASSERT_EQ(child, waitpid(child, NULL, 0)); + + if (self->domain_id) { + /* + * If a domain already existed then everything was pinned within + * the fork, so this copies from one domain to another. + */ + test_cmd_mock_domain(self->ioas_id, NULL, NULL); + check_access_rw(_metadata, self->fd, access_id, + MOCK_APERTURE_START, 0); + + } else { + /* + * Otherwise we need to actually pin pages which can't happen + * since the fork is gone. + */ + test_err_mock_domain(EFAULT, self->ioas_id, NULL, NULL); + } + + test_cmd_destroy_access(access_id); +} + +TEST_F(iommufd_ioas, fork_present) +{ + __u32 access_id; + int pipefds[2]; + uint64_t tmp; + pid_t child; + int efd; + + test_cmd_create_access(self->ioas_id, &access_id, 0); + + ASSERT_EQ(0, pipe2(pipefds, O_CLOEXEC)); + efd = eventfd(0, EFD_CLOEXEC); + ASSERT_NE(-1, efd); + + /* Create a mapping with a different mm */ + child = fork(); + if (!child) { + __u64 iova; + uint64_t one = 1; + + close(pipefds[1]); + test_ioctl_ioas_map_fixed(buffer, BUFFER_SIZE, + MOCK_APERTURE_START); + if (write(efd, &one, sizeof(one)) != sizeof(one)) + exit(100); + if (read(pipefds[0], &iova, 1) != 1) + exit(100); + exit(0); + } + close(pipefds[0]); + ASSERT_NE(-1, child); + ASSERT_EQ(8, read(efd, &tmp, sizeof(tmp))); + + /* Read pages from the remote process */ + test_cmd_mock_domain(self->ioas_id, NULL, NULL); + check_access_rw(_metadata, self->fd, access_id, MOCK_APERTURE_START, 0); + + ASSERT_EQ(0, close(pipefds[1])); + ASSERT_EQ(child, waitpid(child, NULL, 0)); + + test_cmd_destroy_access(access_id); +} + +TEST_F(iommufd_ioas, ioas_option_huge_pages) +{ + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_GET, + .val64 = 3, + .object_id = self->ioas_id, + }; + + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + ASSERT_EQ(1, cmd.val64); + + cmd.op = IOMMU_OPTION_OP_SET; + cmd.val64 = 0; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + cmd.op = IOMMU_OPTION_OP_GET; + cmd.val64 = 3; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + ASSERT_EQ(0, cmd.val64); + + cmd.op = IOMMU_OPTION_OP_SET; + cmd.val64 = 2; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + cmd.op = IOMMU_OPTION_OP_SET; + cmd.val64 = 1; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); +} + +TEST_F(iommufd_ioas, ioas_iova_alloc) +{ + unsigned int length; + __u64 iova; + + for (length = 1; length != PAGE_SIZE * 2; length++) { + if (variant->mock_domains && (length % MOCK_PAGE_SIZE)) { + test_err_ioctl_ioas_map(EINVAL, buffer, length, &iova); + } else { + test_ioctl_ioas_map(buffer, length, &iova); + test_ioctl_ioas_unmap(iova, length); + } + } +} + +TEST_F(iommufd_ioas, ioas_align_change) +{ + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_SET, + .object_id = self->ioas_id, + /* 0 means everything must be aligned to PAGE_SIZE */ + .val64 = 0, + }; + + /* + * We cannot upgrade the alignment using OPTION_HUGE_PAGES when a domain + * and map are present. + */ + if (variant->mock_domains) + return; + + /* + * We can upgrade to PAGE_SIZE alignment when things are aligned right + */ + test_ioctl_ioas_map_fixed(buffer, PAGE_SIZE, MOCK_APERTURE_START); + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + /* Misalignment is rejected at map time */ + test_err_ioctl_ioas_map_fixed(EINVAL, buffer + MOCK_PAGE_SIZE, + PAGE_SIZE, + MOCK_APERTURE_START + PAGE_SIZE); + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + /* Reduce alignment */ + cmd.val64 = 1; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + /* Confirm misalignment is rejected during alignment upgrade */ + test_ioctl_ioas_map_fixed(buffer + MOCK_PAGE_SIZE, PAGE_SIZE, + MOCK_APERTURE_START + PAGE_SIZE); + cmd.val64 = 0; + EXPECT_ERRNO(EADDRINUSE, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + test_ioctl_ioas_unmap(MOCK_APERTURE_START + PAGE_SIZE, PAGE_SIZE); + test_ioctl_ioas_unmap(MOCK_APERTURE_START, PAGE_SIZE); +} + +TEST_F(iommufd_ioas, copy_sweep) +{ + struct iommu_ioas_copy copy_cmd = { + .size = sizeof(copy_cmd), + .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .src_ioas_id = self->ioas_id, + .dst_iova = MOCK_APERTURE_START, + .length = MOCK_PAGE_SIZE, + }; + unsigned int dst_ioas_id; + uint64_t last_iova; + uint64_t iova; + + test_ioctl_ioas_alloc(&dst_ioas_id); + copy_cmd.dst_ioas_id = dst_ioas_id; + + if (variant->mock_domains) + last_iova = MOCK_APERTURE_START + BUFFER_SIZE - 1; + else + last_iova = MOCK_APERTURE_START + BUFFER_SIZE - 2; + + test_ioctl_ioas_map_fixed(buffer, last_iova - MOCK_APERTURE_START + 1, + MOCK_APERTURE_START); + + for (iova = MOCK_APERTURE_START - PAGE_SIZE; iova <= last_iova; + iova += 511) { + copy_cmd.src_iova = iova; + if (iova < MOCK_APERTURE_START || + iova + copy_cmd.length - 1 > last_iova) { + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_IOAS_COPY, + ©_cmd)); + } else { + ASSERT_EQ(0, + ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); + test_ioctl_ioas_unmap_id(dst_ioas_id, copy_cmd.dst_iova, + copy_cmd.length); + } + } + + test_ioctl_destroy(dst_ioas_id); +} + +FIXTURE(iommufd_mock_domain) +{ + int fd; + uint32_t ioas_id; + uint32_t domain_id; + uint32_t domain_ids[2]; + int mmap_flags; + size_t mmap_buf_size; +}; + +FIXTURE_VARIANT(iommufd_mock_domain) +{ + unsigned int mock_domains; + bool hugepages; +}; + +FIXTURE_SETUP(iommufd_mock_domain) +{ + unsigned int i; + + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + test_ioctl_ioas_alloc(&self->ioas_id); + + ASSERT_GE(ARRAY_SIZE(self->domain_ids), variant->mock_domains); + + for (i = 0; i != variant->mock_domains; i++) + test_cmd_mock_domain(self->ioas_id, NULL, &self->domain_ids[i]); + self->domain_id = self->domain_ids[0]; + + self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; + self->mmap_buf_size = PAGE_SIZE * 8; + if (variant->hugepages) { + /* + * MAP_POPULATE will cause the kernel to fail mmap if THPs are + * not available. + */ + self->mmap_flags |= MAP_HUGETLB | MAP_POPULATE; + self->mmap_buf_size = HUGEPAGE_SIZE * 2; + } +} + +FIXTURE_TEARDOWN(iommufd_mock_domain) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain) +{ + .mock_domains = 1, + .hugepages = false, +}; + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains) +{ + .mock_domains = 2, + .hugepages = false, +}; + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, one_domain_hugepage) +{ + .mock_domains = 1, + .hugepages = true, +}; + +FIXTURE_VARIANT_ADD(iommufd_mock_domain, two_domains_hugepage) +{ + .mock_domains = 2, + .hugepages = true, +}; + +/* Have the kernel check that the user pages made it to the iommu_domain */ +#define check_mock_iova(_ptr, _iova, _length) \ + ({ \ + struct iommu_test_cmd check_map_cmd = { \ + .size = sizeof(check_map_cmd), \ + .op = IOMMU_TEST_OP_MD_CHECK_MAP, \ + .id = self->domain_id, \ + .check_map = { .iova = _iova, \ + .length = _length, \ + .uptr = (uintptr_t)(_ptr) }, \ + }; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP), \ + &check_map_cmd)); \ + if (self->domain_ids[1]) { \ + check_map_cmd.id = self->domain_ids[1]; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD( \ + IOMMU_TEST_OP_MD_CHECK_MAP), \ + &check_map_cmd)); \ + } \ + }) + +TEST_F(iommufd_mock_domain, basic) +{ + size_t buf_size = self->mmap_buf_size; + uint8_t *buf; + __u64 iova; + + /* Simple one page map */ + test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova); + check_mock_iova(buffer, iova, PAGE_SIZE); + + buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1, + 0); + ASSERT_NE(MAP_FAILED, buf); + + /* EFAULT half way through mapping */ + ASSERT_EQ(0, munmap(buf + buf_size / 2, buf_size / 2)); + test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova); + + /* EFAULT on first page */ + ASSERT_EQ(0, munmap(buf, buf_size / 2)); + test_err_ioctl_ioas_map(EFAULT, buf, buf_size, &iova); +} + +TEST_F(iommufd_mock_domain, ro_unshare) +{ + uint8_t *buf; + __u64 iova; + int fd; + + fd = open("/proc/self/exe", O_RDONLY); + ASSERT_NE(-1, fd); + + buf = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + ASSERT_NE(MAP_FAILED, buf); + close(fd); + + /* + * There have been lots of changes to the "unshare" mechanism in + * get_user_pages(), make sure it works right. The write to the page + * after we map it for reading should not change the assigned PFN. + */ + ASSERT_EQ(0, + _test_ioctl_ioas_map(self->fd, self->ioas_id, buf, PAGE_SIZE, + &iova, IOMMU_IOAS_MAP_READABLE)); + check_mock_iova(buf, iova, PAGE_SIZE); + memset(buf, 1, PAGE_SIZE); + check_mock_iova(buf, iova, PAGE_SIZE); + ASSERT_EQ(0, munmap(buf, PAGE_SIZE)); +} + +TEST_F(iommufd_mock_domain, all_aligns) +{ + size_t test_step = variant->hugepages ? (self->mmap_buf_size / 16) : + MOCK_PAGE_SIZE; + size_t buf_size = self->mmap_buf_size; + unsigned int start; + unsigned int end; + uint8_t *buf; + + buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1, + 0); + ASSERT_NE(MAP_FAILED, buf); + check_refs(buf, buf_size, 0); + + /* + * Map every combination of page size and alignment within a big region, + * less for hugepage case as it takes so long to finish. + */ + for (start = 0; start < buf_size; start += test_step) { + if (variant->hugepages) + end = buf_size; + else + end = start + MOCK_PAGE_SIZE; + for (; end < buf_size; end += MOCK_PAGE_SIZE) { + size_t length = end - start; + __u64 iova; + + test_ioctl_ioas_map(buf + start, length, &iova); + check_mock_iova(buf + start, iova, length); + check_refs(buf + start / PAGE_SIZE * PAGE_SIZE, + end / PAGE_SIZE * PAGE_SIZE - + start / PAGE_SIZE * PAGE_SIZE, + 1); + + test_ioctl_ioas_unmap(iova, length); + } + } + check_refs(buf, buf_size, 0); + ASSERT_EQ(0, munmap(buf, buf_size)); +} + +TEST_F(iommufd_mock_domain, all_aligns_copy) +{ + size_t test_step = variant->hugepages ? self->mmap_buf_size / 16 : + MOCK_PAGE_SIZE; + size_t buf_size = self->mmap_buf_size; + unsigned int start; + unsigned int end; + uint8_t *buf; + + buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, self->mmap_flags, -1, + 0); + ASSERT_NE(MAP_FAILED, buf); + check_refs(buf, buf_size, 0); + + /* + * Map every combination of page size and alignment within a big region, + * less for hugepage case as it takes so long to finish. + */ + for (start = 0; start < buf_size; start += test_step) { + if (variant->hugepages) + end = buf_size; + else + end = start + MOCK_PAGE_SIZE; + for (; end < buf_size; end += MOCK_PAGE_SIZE) { + size_t length = end - start; + unsigned int old_id; + uint32_t mock_device_id; + __u64 iova; + + test_ioctl_ioas_map(buf + start, length, &iova); + + /* Add and destroy a domain while the area exists */ + old_id = self->domain_ids[1]; + test_cmd_mock_domain(self->ioas_id, &mock_device_id, + &self->domain_ids[1]); + + check_mock_iova(buf + start, iova, length); + check_refs(buf + start / PAGE_SIZE * PAGE_SIZE, + end / PAGE_SIZE * PAGE_SIZE - + start / PAGE_SIZE * PAGE_SIZE, + 1); + + test_ioctl_destroy(mock_device_id); + test_ioctl_destroy(self->domain_ids[1]); + self->domain_ids[1] = old_id; + + test_ioctl_ioas_unmap(iova, length); + } + } + check_refs(buf, buf_size, 0); + ASSERT_EQ(0, munmap(buf, buf_size)); +} + +TEST_F(iommufd_mock_domain, user_copy) +{ + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_PAGES, + .access_pages = { .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + struct iommu_ioas_copy copy_cmd = { + .size = sizeof(copy_cmd), + .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .dst_ioas_id = self->ioas_id, + .dst_iova = MOCK_APERTURE_START, + .length = BUFFER_SIZE, + }; + unsigned int ioas_id; + + /* Pin the pages in an IOAS with no domains then copy to an IOAS with domains */ + test_ioctl_ioas_alloc(&ioas_id); + test_ioctl_ioas_map_id(ioas_id, buffer, BUFFER_SIZE, + ©_cmd.src_iova); + + test_cmd_create_access(ioas_id, &access_cmd.id, + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES); + + access_cmd.access_pages.iova = copy_cmd.src_iova; + ASSERT_EQ(0, + ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES), + &access_cmd)); + copy_cmd.src_ioas_id = ioas_id; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)); + check_mock_iova(buffer, MOCK_APERTURE_START, BUFFER_SIZE); + + test_cmd_destroy_access_pages( + access_cmd.id, access_cmd.access_pages.out_access_pages_id); + test_cmd_destroy_access(access_cmd.id) test_ioctl_destroy(ioas_id); + + test_ioctl_destroy(ioas_id); +} + +/* VFIO compatibility IOCTLs */ + +TEST_F(iommufd, simple_ioctls) +{ + ASSERT_EQ(VFIO_API_VERSION, ioctl(self->fd, VFIO_GET_API_VERSION)); + ASSERT_EQ(1, ioctl(self->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)); +} + +TEST_F(iommufd, unmap_cmd) +{ + struct vfio_iommu_type1_dma_unmap unmap_cmd = { + .iova = MOCK_APERTURE_START, + .size = PAGE_SIZE, + }; + + unmap_cmd.argsz = 1; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + + unmap_cmd.argsz = sizeof(unmap_cmd); + unmap_cmd.flags = 1 << 31; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + + unmap_cmd.flags = 0; + EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); +} + +TEST_F(iommufd, map_cmd) +{ + struct vfio_iommu_type1_dma_map map_cmd = { + .iova = MOCK_APERTURE_START, + .size = PAGE_SIZE, + .vaddr = (__u64)buffer, + }; + + map_cmd.argsz = 1; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + + map_cmd.argsz = sizeof(map_cmd); + map_cmd.flags = 1 << 31; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + + /* Requires a domain to be attached */ + map_cmd.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); +} + +TEST_F(iommufd, info_cmd) +{ + struct vfio_iommu_type1_info info_cmd = {}; + + /* Invalid argsz */ + info_cmd.argsz = 1; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_GET_INFO, &info_cmd)); + + info_cmd.argsz = sizeof(info_cmd); + EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_IOMMU_GET_INFO, &info_cmd)); +} + +TEST_F(iommufd, set_iommu_cmd) +{ + /* Requires a domain to be attached */ + EXPECT_ERRNO(ENODEV, + ioctl(self->fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU)); + EXPECT_ERRNO(ENODEV, ioctl(self->fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)); +} + +TEST_F(iommufd, vfio_ioas) +{ + struct iommu_vfio_ioas vfio_ioas_cmd = { + .size = sizeof(vfio_ioas_cmd), + .op = IOMMU_VFIO_IOAS_GET, + }; + __u32 ioas_id; + + /* ENODEV if there is no compat ioas */ + EXPECT_ERRNO(ENODEV, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + + /* Invalid id for set */ + vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_SET; + EXPECT_ERRNO(ENOENT, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + + /* Valid id for set*/ + test_ioctl_ioas_alloc(&ioas_id); + vfio_ioas_cmd.ioas_id = ioas_id; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + + /* Same id comes back from get */ + vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_GET; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + ASSERT_EQ(ioas_id, vfio_ioas_cmd.ioas_id); + + /* Clear works */ + vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_CLEAR; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + vfio_ioas_cmd.op = IOMMU_VFIO_IOAS_GET; + EXPECT_ERRNO(ENODEV, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); +} + +FIXTURE(vfio_compat_mock_domain) +{ + int fd; + uint32_t ioas_id; +}; + +FIXTURE_VARIANT(vfio_compat_mock_domain) +{ + unsigned int version; +}; + +FIXTURE_SETUP(vfio_compat_mock_domain) +{ + struct iommu_vfio_ioas vfio_ioas_cmd = { + .size = sizeof(vfio_ioas_cmd), + .op = IOMMU_VFIO_IOAS_SET, + }; + + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + + /* Create what VFIO would consider a group */ + test_ioctl_ioas_alloc(&self->ioas_id); + test_cmd_mock_domain(self->ioas_id, NULL, NULL); + + /* Attach it to the vfio compat */ + vfio_ioas_cmd.ioas_id = self->ioas_id; + ASSERT_EQ(0, ioctl(self->fd, IOMMU_VFIO_IOAS, &vfio_ioas_cmd)); + ASSERT_EQ(0, ioctl(self->fd, VFIO_SET_IOMMU, variant->version)); +} + +FIXTURE_TEARDOWN(vfio_compat_mock_domain) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(vfio_compat_mock_domain, Ver1v2) +{ + .version = VFIO_TYPE1v2_IOMMU, +}; + +FIXTURE_VARIANT_ADD(vfio_compat_mock_domain, Ver1v0) +{ + .version = VFIO_TYPE1_IOMMU, +}; + +TEST_F(vfio_compat_mock_domain, simple_close) +{ +} + +TEST_F(vfio_compat_mock_domain, option_huge_pages) +{ + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_GET, + .val64 = 3, + .object_id = self->ioas_id, + }; + + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + if (variant->version == VFIO_TYPE1_IOMMU) { + ASSERT_EQ(0, cmd.val64); + } else { + ASSERT_EQ(1, cmd.val64); + } +} + +/* + * Execute an ioctl command stored in buffer and check that the result does not + * overflow memory. + */ +static bool is_filled(const void *buf, uint8_t c, size_t len) +{ + const uint8_t *cbuf = buf; + + for (; len; cbuf++, len--) + if (*cbuf != c) + return false; + return true; +} + +#define ioctl_check_buf(fd, cmd) \ + ({ \ + size_t _cmd_len = *(__u32 *)buffer; \ + \ + memset(buffer + _cmd_len, 0xAA, BUFFER_SIZE - _cmd_len); \ + ASSERT_EQ(0, ioctl(fd, cmd, buffer)); \ + ASSERT_EQ(true, is_filled(buffer + _cmd_len, 0xAA, \ + BUFFER_SIZE - _cmd_len)); \ + }) + +static void check_vfio_info_cap_chain(struct __test_metadata *_metadata, + struct vfio_iommu_type1_info *info_cmd) +{ + const struct vfio_info_cap_header *cap; + + ASSERT_GE(info_cmd->argsz, info_cmd->cap_offset + sizeof(*cap)); + cap = buffer + info_cmd->cap_offset; + while (true) { + size_t cap_size; + + if (cap->next) + cap_size = (buffer + cap->next) - (void *)cap; + else + cap_size = (buffer + info_cmd->argsz) - (void *)cap; + + switch (cap->id) { + case VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE: { + struct vfio_iommu_type1_info_cap_iova_range *data = + (void *)cap; + + ASSERT_EQ(1, data->header.version); + ASSERT_EQ(1, data->nr_iovas); + EXPECT_EQ(MOCK_APERTURE_START, + data->iova_ranges[0].start); + EXPECT_EQ(MOCK_APERTURE_LAST, data->iova_ranges[0].end); + break; + } + case VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL: { + struct vfio_iommu_type1_info_dma_avail *data = + (void *)cap; + + ASSERT_EQ(1, data->header.version); + ASSERT_EQ(sizeof(*data), cap_size); + break; + } + default: + ASSERT_EQ(false, true); + break; + } + if (!cap->next) + break; + + ASSERT_GE(info_cmd->argsz, cap->next + sizeof(*cap)); + ASSERT_GE(buffer + cap->next, (void *)cap); + cap = buffer + cap->next; + } +} + +TEST_F(vfio_compat_mock_domain, get_info) +{ + struct vfio_iommu_type1_info *info_cmd = buffer; + unsigned int i; + size_t caplen; + + /* Pre-cap ABI */ + *info_cmd = (struct vfio_iommu_type1_info){ + .argsz = offsetof(struct vfio_iommu_type1_info, cap_offset), + }; + ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO); + ASSERT_NE(0, info_cmd->iova_pgsizes); + ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS, + info_cmd->flags); + + /* Read the cap chain size */ + *info_cmd = (struct vfio_iommu_type1_info){ + .argsz = sizeof(*info_cmd), + }; + ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO); + ASSERT_NE(0, info_cmd->iova_pgsizes); + ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS, + info_cmd->flags); + ASSERT_EQ(0, info_cmd->cap_offset); + ASSERT_LT(sizeof(*info_cmd), info_cmd->argsz); + + /* Read the caps, kernel should never create a corrupted caps */ + caplen = info_cmd->argsz; + for (i = sizeof(*info_cmd); i < caplen; i++) { + *info_cmd = (struct vfio_iommu_type1_info){ + .argsz = i, + }; + ioctl_check_buf(self->fd, VFIO_IOMMU_GET_INFO); + ASSERT_EQ(VFIO_IOMMU_INFO_PGSIZES | VFIO_IOMMU_INFO_CAPS, + info_cmd->flags); + if (!info_cmd->cap_offset) + continue; + check_vfio_info_cap_chain(_metadata, info_cmd); + } +} + +static void shuffle_array(unsigned long *array, size_t nelms) +{ + unsigned int i; + + /* Shuffle */ + for (i = 0; i != nelms; i++) { + unsigned long tmp = array[i]; + unsigned int other = rand() % (nelms - i); + + array[i] = array[other]; + array[other] = tmp; + } +} + +TEST_F(vfio_compat_mock_domain, map) +{ + struct vfio_iommu_type1_dma_map map_cmd = { + .argsz = sizeof(map_cmd), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = (uintptr_t)buffer, + .size = BUFFER_SIZE, + .iova = MOCK_APERTURE_START, + }; + struct vfio_iommu_type1_dma_unmap unmap_cmd = { + .argsz = sizeof(unmap_cmd), + .size = BUFFER_SIZE, + .iova = MOCK_APERTURE_START, + }; + unsigned long pages_iova[BUFFER_SIZE / PAGE_SIZE]; + unsigned int i; + + /* Simple map/unmap */ + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); + + /* UNMAP_FLAG_ALL requres 0 iova/size */ + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL; + EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + + unmap_cmd.iova = 0; + unmap_cmd.size = 0; + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); + + /* Small pages */ + for (i = 0; i != ARRAY_SIZE(pages_iova); i++) { + map_cmd.iova = pages_iova[i] = + MOCK_APERTURE_START + i * PAGE_SIZE; + map_cmd.vaddr = (uintptr_t)buffer + i * PAGE_SIZE; + map_cmd.size = PAGE_SIZE; + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + } + shuffle_array(pages_iova, ARRAY_SIZE(pages_iova)); + + unmap_cmd.flags = 0; + unmap_cmd.size = PAGE_SIZE; + for (i = 0; i != ARRAY_SIZE(pages_iova); i++) { + unmap_cmd.iova = pages_iova[i]; + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); + } +} + +TEST_F(vfio_compat_mock_domain, huge_map) +{ + size_t buf_size = HUGEPAGE_SIZE * 2; + struct vfio_iommu_type1_dma_map map_cmd = { + .argsz = sizeof(map_cmd), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .size = buf_size, + .iova = MOCK_APERTURE_START, + }; + struct vfio_iommu_type1_dma_unmap unmap_cmd = { + .argsz = sizeof(unmap_cmd), + }; + unsigned long pages_iova[16]; + unsigned int i; + void *buf; + + /* Test huge pages and splitting */ + buf = mmap(0, buf_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, + 0); + ASSERT_NE(MAP_FAILED, buf); + map_cmd.vaddr = (uintptr_t)buf; + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); + + unmap_cmd.size = buf_size / ARRAY_SIZE(pages_iova); + for (i = 0; i != ARRAY_SIZE(pages_iova); i++) + pages_iova[i] = MOCK_APERTURE_START + (i * unmap_cmd.size); + shuffle_array(pages_iova, ARRAY_SIZE(pages_iova)); + + /* type1 mode can cut up larger mappings, type1v2 always fails */ + for (i = 0; i != ARRAY_SIZE(pages_iova); i++) { + unmap_cmd.iova = pages_iova[i]; + unmap_cmd.size = buf_size / ARRAY_SIZE(pages_iova); + if (variant->version == VFIO_TYPE1_IOMMU) { + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, + &unmap_cmd)); + } else { + EXPECT_ERRNO(ENOENT, + ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, + &unmap_cmd)); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c new file mode 100644 index 0000000000000..9713111b820dd --- /dev/null +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -0,0 +1,580 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + * + * These tests are "kernel integrity" tests. They are looking for kernel + * WARN/OOPS/kasn/etc splats triggered by kernel sanitizers & debugging + * features. It does not attempt to verify that the system calls are doing what + * they are supposed to do. + * + * The basic philosophy is to run a sequence of calls that will succeed and then + * sweep every failure injection point on that call chain to look for + * interesting things in error handling. + * + * This test is best run with: + * echo 1 > /proc/sys/kernel/panic_on_warn + * If something is actually going wrong. + */ +#include +#include + +#define __EXPORTED_HEADERS__ +#include + +#include "iommufd_utils.h" + +static bool have_fault_injection; + +static int writeat(int dfd, const char *fn, const char *val) +{ + size_t val_len = strlen(val); + ssize_t res; + int fd; + + fd = openat(dfd, fn, O_WRONLY); + if (fd == -1) + return -1; + res = write(fd, val, val_len); + assert(res == val_len); + close(fd); + return 0; +} + +static __attribute__((constructor)) void setup_buffer(void) +{ + BUFFER_SIZE = 2*1024*1024; + + buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); +} + +/* + * This sets up fail_injection in a way that is useful for this test. + * It does not attempt to restore things back to how they were. + */ +static __attribute__((constructor)) void setup_fault_injection(void) +{ + DIR *debugfs = opendir("/sys/kernel/debug/"); + struct dirent *dent; + + if (!debugfs) + return; + + /* Allow any allocation call to be fault injected */ + if (writeat(dirfd(debugfs), "failslab/ignore-gfp-wait", "N")) + return; + writeat(dirfd(debugfs), "fail_page_alloc/ignore-gfp-wait", "N"); + writeat(dirfd(debugfs), "fail_page_alloc/ignore-gfp-highmem", "N"); + + while ((dent = readdir(debugfs))) { + char fn[300]; + + if (strncmp(dent->d_name, "fail", 4) != 0) + continue; + + /* We are looking for kernel splats, quiet down the log */ + snprintf(fn, sizeof(fn), "%s/verbose", dent->d_name); + writeat(dirfd(debugfs), fn, "0"); + } + closedir(debugfs); + have_fault_injection = true; +} + +struct fail_nth_state { + int proc_fd; + unsigned int iteration; +}; + +static void fail_nth_first(struct __test_metadata *_metadata, + struct fail_nth_state *nth_state) +{ + char buf[300]; + + snprintf(buf, sizeof(buf), "/proc/self/task/%u/fail-nth", getpid()); + nth_state->proc_fd = open(buf, O_RDWR); + ASSERT_NE(-1, nth_state->proc_fd); +} + +static bool fail_nth_next(struct __test_metadata *_metadata, + struct fail_nth_state *nth_state, + int test_result) +{ + static const char disable_nth[] = "0"; + char buf[300]; + + /* + * This is just an arbitrary limit based on the current kernel + * situation. Changes in the kernel can dramtically change the number of + * required fault injection sites, so if this hits it doesn't + * necessarily mean a test failure, just that the limit has to be made + * bigger. + */ + ASSERT_GT(400, nth_state->iteration); + if (nth_state->iteration != 0) { + ssize_t res; + ssize_t res2; + + buf[0] = 0; + /* + * Annoyingly disabling the nth can also fail. This means + * the test passed without triggering failure + */ + res = pread(nth_state->proc_fd, buf, sizeof(buf), 0); + if (res == -1 && errno == EFAULT) { + buf[0] = '1'; + buf[1] = '\n'; + res = 2; + } + + res2 = pwrite(nth_state->proc_fd, disable_nth, + ARRAY_SIZE(disable_nth) - 1, 0); + if (res2 == -1 && errno == EFAULT) { + res2 = pwrite(nth_state->proc_fd, disable_nth, + ARRAY_SIZE(disable_nth) - 1, 0); + buf[0] = '1'; + buf[1] = '\n'; + } + ASSERT_EQ(ARRAY_SIZE(disable_nth) - 1, res2); + + /* printf(" nth %u result=%d nth=%u\n", nth_state->iteration, + test_result, atoi(buf)); */ + fflush(stdout); + ASSERT_LT(1, res); + if (res != 2 || buf[0] != '0' || buf[1] != '\n') + return false; + } else { + /* printf(" nth %u result=%d\n", nth_state->iteration, + test_result); */ + } + nth_state->iteration++; + return true; +} + +/* + * This is called during the test to start failure injection. It allows the test + * to do some setup that has already been swept and thus reduce the required + * iterations. + */ +void __fail_nth_enable(struct __test_metadata *_metadata, + struct fail_nth_state *nth_state) +{ + char buf[300]; + size_t len; + + if (!nth_state->iteration) + return; + + len = snprintf(buf, sizeof(buf), "%u", nth_state->iteration); + ASSERT_EQ(len, pwrite(nth_state->proc_fd, buf, len, 0)); +} +#define fail_nth_enable() __fail_nth_enable(_metadata, _nth_state) + +#define TEST_FAIL_NTH(fixture_name, name) \ + static int test_nth_##name(struct __test_metadata *_metadata, \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + *variant, \ + struct fail_nth_state *_nth_state); \ + TEST_F(fixture_name, name) \ + { \ + struct fail_nth_state nth_state = {}; \ + int test_result = 0; \ + \ + if (!have_fault_injection) \ + SKIP(return, \ + "fault injection is not enabled in the kernel"); \ + fail_nth_first(_metadata, &nth_state); \ + ASSERT_EQ(0, test_nth_##name(_metadata, self, variant, \ + &nth_state)); \ + while (fail_nth_next(_metadata, &nth_state, test_result)) { \ + fixture_name##_teardown(_metadata, self, variant); \ + fixture_name##_setup(_metadata, self, variant); \ + test_result = test_nth_##name(_metadata, self, \ + variant, &nth_state); \ + }; \ + ASSERT_EQ(0, test_result); \ + } \ + static int test_nth_##name( \ + struct __test_metadata __attribute__((unused)) *_metadata, \ + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) __attribute__((unused)) \ + *variant, \ + struct fail_nth_state *_nth_state) + +FIXTURE(basic_fail_nth) +{ + int fd; + uint32_t access_id; +}; + +FIXTURE_SETUP(basic_fail_nth) +{ + self->fd = -1; + self->access_id = 0; +} + +FIXTURE_TEARDOWN(basic_fail_nth) +{ + int rc; + + if (self->access_id) { + /* The access FD holds the iommufd open until it closes */ + rc = _test_cmd_destroy_access(self->access_id); + assert(rc == 0); + } + teardown_iommufd(self->fd, _metadata); +} + +/* Cover ioas.c */ +TEST_FAIL_NTH(basic_fail_nth, basic) +{ + struct iommu_iova_range ranges[10]; + uint32_t ioas_id; + __u64 iova; + + fail_nth_enable(); + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + { + struct iommu_ioas_iova_ranges ranges_cmd = { + .size = sizeof(ranges_cmd), + .num_iovas = ARRAY_SIZE(ranges), + .ioas_id = ioas_id, + .allowed_iovas = (uintptr_t)ranges, + }; + if (ioctl(self->fd, IOMMU_IOAS_IOVA_RANGES, &ranges_cmd)) + return -1; + } + + { + struct iommu_ioas_allow_iovas allow_cmd = { + .size = sizeof(allow_cmd), + .ioas_id = ioas_id, + .num_iovas = 1, + .allowed_iovas = (uintptr_t)ranges, + }; + + ranges[0].start = 16*1024; + ranges[0].last = BUFFER_SIZE + 16 * 1024 * 600 - 1; + if (ioctl(self->fd, IOMMU_IOAS_ALLOW_IOVAS, &allow_cmd)) + return -1; + } + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + { + struct iommu_ioas_copy copy_cmd = { + .size = sizeof(copy_cmd), + .flags = IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE, + .dst_ioas_id = ioas_id, + .src_ioas_id = ioas_id, + .src_iova = iova, + .length = sizeof(ranges), + }; + + if (ioctl(self->fd, IOMMU_IOAS_COPY, ©_cmd)) + return -1; + } + + if (_test_ioctl_ioas_unmap(self->fd, ioas_id, iova, BUFFER_SIZE, + NULL)) + return -1; + /* Failure path of no IOVA to unmap */ + _test_ioctl_ioas_unmap(self->fd, ioas_id, iova, BUFFER_SIZE, NULL); + return 0; +} + +/* iopt_area_fill_domains() and iopt_area_fill_domain() */ +TEST_FAIL_NTH(basic_fail_nth, map_domain) +{ + uint32_t ioas_id; + __u32 device_id; + __u32 hwpt_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id)) + return -1; + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + if (_test_ioctl_destroy(self->fd, device_id)) + return -1; + if (_test_ioctl_destroy(self->fd, hwpt_id)) + return -1; + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id)) + return -1; + return 0; +} + +TEST_FAIL_NTH(basic_fail_nth, map_two_domains) +{ + uint32_t ioas_id; + __u32 device_id2; + __u32 device_id; + __u32 hwpt_id2; + __u32 hwpt_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id2, &hwpt_id2)) + return -1; + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + if (_test_ioctl_destroy(self->fd, device_id)) + return -1; + if (_test_ioctl_destroy(self->fd, hwpt_id)) + return -1; + + if (_test_ioctl_destroy(self->fd, device_id2)) + return -1; + if (_test_ioctl_destroy(self->fd, hwpt_id2)) + return -1; + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id)) + return -1; + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id2, &hwpt_id2)) + return -1; + return 0; +} + +TEST_FAIL_NTH(basic_fail_nth, access_rw) +{ + uint64_t tmp_big[4096]; + uint32_t ioas_id; + uint16_t tmp[32]; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + fail_nth_enable(); + + if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id, 0)) + return -1; + + { + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_RW, + .id = self->access_id, + .access_rw = { .iova = iova, + .length = sizeof(tmp), + .uptr = (uintptr_t)tmp }, + }; + + // READ + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + + access_cmd.access_rw.flags = MOCK_ACCESS_RW_WRITE; + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + + access_cmd.access_rw.flags = MOCK_ACCESS_RW_SLOW_PATH; + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + access_cmd.access_rw.flags = MOCK_ACCESS_RW_SLOW_PATH | + MOCK_ACCESS_RW_WRITE; + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + } + + { + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_RW, + .id = self->access_id, + .access_rw = { .iova = iova, + .flags = MOCK_ACCESS_RW_SLOW_PATH, + .length = sizeof(tmp_big), + .uptr = (uintptr_t)tmp_big }, + }; + + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + } + if (_test_cmd_destroy_access(self->access_id)) + return -1; + self->access_id = 0; + return 0; +} + +/* pages.c access functions */ +TEST_FAIL_NTH(basic_fail_nth, access_pin) +{ + uint32_t access_pages_id; + uint32_t ioas_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id, + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES)) + return -1; + + fail_nth_enable(); + + { + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_PAGES, + .id = self->access_id, + .access_pages = { .iova = iova, + .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + access_pages_id = access_cmd.access_pages.out_access_pages_id; + } + + if (_test_cmd_destroy_access_pages(self->fd, self->access_id, + access_pages_id)) + return -1; + + if (_test_cmd_destroy_access(self->access_id)) + return -1; + self->access_id = 0; + return 0; +} + +/* iopt_pages_fill_xarray() */ +TEST_FAIL_NTH(basic_fail_nth, access_pin_domain) +{ + uint32_t access_pages_id; + uint32_t ioas_id; + __u32 device_id; + __u32 hwpt_id; + __u64 iova; + + self->fd = open("/dev/iommu", O_RDWR); + if (self->fd == -1) + return -1; + + if (_test_ioctl_ioas_alloc(self->fd, &ioas_id)) + return -1; + + if (_test_ioctl_set_temp_memory_limit(self->fd, 32)) + return -1; + + if (_test_cmd_mock_domain(self->fd, ioas_id, &device_id, &hwpt_id)) + return -1; + + if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova, + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -1; + + if (_test_cmd_create_access(self->fd, ioas_id, &self->access_id, + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES)) + return -1; + + fail_nth_enable(); + + { + struct iommu_test_cmd access_cmd = { + .size = sizeof(access_cmd), + .op = IOMMU_TEST_OP_ACCESS_PAGES, + .id = self->access_id, + .access_pages = { .iova = iova, + .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + + if (ioctl(self->fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_RW), + &access_cmd)) + return -1; + access_pages_id = access_cmd.access_pages.out_access_pages_id; + } + + if (_test_cmd_destroy_access_pages(self->fd, self->access_id, + access_pages_id)) + return -1; + + if (_test_cmd_destroy_access(self->access_id)) + return -1; + self->access_id = 0; + + if (_test_ioctl_destroy(self->fd, device_id)) + return -1; + if (_test_ioctl_destroy(self->fd, hwpt_id)) + return -1; + return 0; +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h new file mode 100644 index 0000000000000..0d1f46369c2a3 --- /dev/null +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#ifndef __SELFTEST_IOMMUFD_UTILS +#define __SELFTEST_IOMMUFD_UTILS + +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" +#include "../../../../drivers/iommu/iommufd/iommufd_test.h" + +/* Hack to make assertions more readable */ +#define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD + +static void *buffer; +static unsigned long BUFFER_SIZE; + +/* + * Have the kernel check the refcount on pages. I don't know why a freshly + * mmap'd anon non-compound page starts out with a ref of 3 + */ +#define check_refs(_ptr, _length, _refs) \ + ({ \ + struct iommu_test_cmd test_cmd = { \ + .size = sizeof(test_cmd), \ + .op = IOMMU_TEST_OP_MD_CHECK_REFS, \ + .check_refs = { .length = _length, \ + .uptr = (uintptr_t)(_ptr), \ + .refs = _refs }, \ + }; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_REFS), \ + &test_cmd)); \ + }) + +static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *device_id, + __u32 *hwpt_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_MOCK_DOMAIN, + .id = ioas_id, + .mock_domain = {}, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + if (device_id) + *device_id = cmd.mock_domain.out_device_id; + assert(cmd.id != 0); + if (hwpt_id) + *hwpt_id = cmd.mock_domain.out_hwpt_id; + return 0; +} +#define test_cmd_mock_domain(ioas_id, device_id, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, device_id, \ + hwpt_id)) +#define test_err_mock_domain(_errno, ioas_id, device_id, hwpt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ + device_id, hwpt_id)) + +static int _test_cmd_create_access(int fd, unsigned int ioas_id, + __u32 *access_id, unsigned int flags) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_CREATE_ACCESS, + .id = ioas_id, + .create_access = { .flags = flags }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + *access_id = cmd.create_access.out_access_fd; + return 0; +} +#define test_cmd_create_access(ioas_id, access_id, flags) \ + ASSERT_EQ(0, _test_cmd_create_access(self->fd, ioas_id, access_id, \ + flags)) + +static int _test_cmd_destroy_access(unsigned int access_id) +{ + return close(access_id); +} +#define test_cmd_destroy_access(access_id) \ + ASSERT_EQ(0, _test_cmd_destroy_access(access_id)) + +static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id, + unsigned int access_pages_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DESTROY_ACCESS_PAGES, + .id = access_id, + .destroy_access_pages = { .access_pages_id = access_pages_id }, + }; + return ioctl(fd, IOMMU_TEST_CMD, &cmd); +} +#define test_cmd_destroy_access_pages(access_id, access_pages_id) \ + ASSERT_EQ(0, _test_cmd_destroy_access_pages(self->fd, access_id, \ + access_pages_id)) +#define test_err_destroy_access_pages(_errno, access_id, access_pages_id) \ + EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \ + self->fd, access_id, access_pages_id)) + +static int _test_ioctl_destroy(int fd, unsigned int id) +{ + struct iommu_destroy cmd = { + .size = sizeof(cmd), + .id = id, + }; + return ioctl(fd, IOMMU_DESTROY, &cmd); +} +#define test_ioctl_destroy(id) ASSERT_EQ(0, _test_ioctl_destroy(self->fd, id)) + +static int _test_ioctl_ioas_alloc(int fd, __u32 *id) +{ + struct iommu_ioas_alloc cmd = { + .size = sizeof(cmd), + }; + int ret; + + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &cmd); + if (ret) + return ret; + *id = cmd.out_ioas_id; + return 0; +} +#define test_ioctl_ioas_alloc(id) \ + ({ \ + ASSERT_EQ(0, _test_ioctl_ioas_alloc(self->fd, id)); \ + ASSERT_NE(0, *(id)); \ + }) + +static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer, + size_t length, __u64 *iova, unsigned int flags) +{ + struct iommu_ioas_map cmd = { + .size = sizeof(cmd), + .flags = flags, + .ioas_id = ioas_id, + .user_va = (uintptr_t)buffer, + .length = length, + }; + int ret; + + if (flags & IOMMU_IOAS_MAP_FIXED_IOVA) + cmd.iova = *iova; + + ret = ioctl(fd, IOMMU_IOAS_MAP, &cmd); + *iova = cmd.iova; + return ret; +} +#define test_ioctl_ioas_map(buffer, length, iova_p) \ + ASSERT_EQ(0, _test_ioctl_ioas_map(self->fd, self->ioas_id, buffer, \ + length, iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)) + +#define test_err_ioctl_ioas_map(_errno, buffer, length, iova_p) \ + EXPECT_ERRNO(_errno, \ + _test_ioctl_ioas_map(self->fd, self->ioas_id, buffer, \ + length, iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)) + +#define test_ioctl_ioas_map_id(ioas_id, buffer, length, iova_p) \ + ASSERT_EQ(0, _test_ioctl_ioas_map(self->fd, ioas_id, buffer, length, \ + iova_p, \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)) + +#define test_ioctl_ioas_map_fixed(buffer, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, _test_ioctl_ioas_map( \ + self->fd, self->ioas_id, buffer, length, \ + &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + +#define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova) \ + ({ \ + __u64 __iova = iova; \ + EXPECT_ERRNO(_errno, \ + _test_ioctl_ioas_map( \ + self->fd, self->ioas_id, buffer, length, \ + &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + +static int _test_ioctl_ioas_unmap(int fd, unsigned int ioas_id, uint64_t iova, + size_t length, uint64_t *out_len) +{ + struct iommu_ioas_unmap cmd = { + .size = sizeof(cmd), + .ioas_id = ioas_id, + .iova = iova, + .length = length, + }; + int ret; + + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &cmd); + if (out_len) + *out_len = cmd.length; + return ret; +} +#define test_ioctl_ioas_unmap(iova, length) \ + ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, iova, \ + length, NULL)) + +#define test_ioctl_ioas_unmap_id(ioas_id, iova, length) \ + ASSERT_EQ(0, _test_ioctl_ioas_unmap(self->fd, ioas_id, iova, length, \ + NULL)) + +#define test_err_ioctl_ioas_unmap(_errno, iova, length) \ + EXPECT_ERRNO(_errno, _test_ioctl_ioas_unmap(self->fd, self->ioas_id, \ + iova, length, NULL)) + +static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) +{ + struct iommu_test_cmd memlimit_cmd = { + .size = sizeof(memlimit_cmd), + .op = IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, + .memory_limit = { .limit = limit }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT), + &memlimit_cmd); +} + +#define test_ioctl_set_temp_memory_limit(limit) \ + ASSERT_EQ(0, _test_ioctl_set_temp_memory_limit(self->fd, limit)) + +#define test_ioctl_set_default_memory_limit() \ + test_ioctl_set_temp_memory_limit(65536) + +static void teardown_iommufd(int fd, struct __test_metadata *_metadata) +{ + struct iommu_test_cmd test_cmd = { + .size = sizeof(test_cmd), + .op = IOMMU_TEST_OP_MD_CHECK_REFS, + .check_refs = { .length = BUFFER_SIZE, + .uptr = (uintptr_t)buffer }, + }; + + if (fd == -1) + return; + + EXPECT_EQ(0, close(fd)); + + fd = open("/dev/iommu", O_RDWR); + EXPECT_NE(-1, fd); + EXPECT_EQ(0, ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_REFS), + &test_cmd)); + EXPECT_EQ(0, close(fd)); +} + +#define EXPECT_ERRNO(expected_errno, cmd) \ + ({ \ + ASSERT_EQ(-1, cmd); \ + EXPECT_EQ(expected_errno, errno); \ + }) + +#endif -- GitLab From 5c8c0b3273822cf982c250a9a19e003e4b315edb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Aug 2022 00:17:04 +0000 Subject: [PATCH 1019/1423] KVM: x86: Delete documentation for READ|WRITE in KVM_X86_SET_MSR_FILTER Delete the paragraph that describes the behavior when both KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE are set for a range. There is nothing special about KVM's handling of this combination, whereas explicitly documenting the combination suggests that there is some magic behavior the user needs to be aware of. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220831001706.4075399-2-seanjc@google.com --- Documentation/virt/kvm/api.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5617bc4f899f4..373cf425e85c1 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4115,13 +4115,6 @@ flags values for ``struct kvm_msr_filter_range``: a write for a particular MSR should be handled regardless of the default filter action. -``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE`` - - Filter both read and write accesses to MSRs using the given bitmap. A 0 - in the bitmap indicates that both reads and writes should immediately fail, - while a 1 indicates that reads and writes for a particular MSR are not - filtered by this range. - flags values for ``struct kvm_msr_filter``: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` -- GitLab From b93d2ec34ef368bb854289db99d8d6ca7f523e25 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Aug 2022 00:17:05 +0000 Subject: [PATCH 1020/1423] KVM: x86: Reword MSR filtering docs to more precisely define behavior Reword the MSR filtering documentatiion to more precisely define the behavior of filtering using common virtualization terminology. - Explicitly document KVM's behavior when an MSR is denied - s/handled/allowed as there is no guarantee KVM will "handle" the MSR access - Drop the "fall back" terminology, which incorrectly suggests that there is existing KVM behavior to fall back to - Fix an off-by-one error in the range (the end is exclusive) - Call out the interaction between MSR filtering and KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER - Delete the redundant paragraph on what '0' and '1' in the bitmap means, it's covered by the sections on KVM_MSR_FILTER_{READ,WRITE} - Delete the clause on x2APIC MSR behavior depending on APIC base, this is covered by stating that KVM follows architectural behavior when emulating/virtualizing MSR accesses Reported-by: Aaron Lewis Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220831001706.4075399-3-seanjc@google.com --- Documentation/virt/kvm/api.rst | 70 +++++++++++++++++----------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 373cf425e85c1..a4d07b866dea8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4104,15 +4104,15 @@ flags values for ``struct kvm_msr_filter_range``: ``KVM_MSR_FILTER_READ`` Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap - indicates that a read should immediately fail, while a 1 indicates that - a read for a particular MSR should be handled regardless of the default + indicates that read accesses should be denied, while a 1 indicates that + a read for a particular MSR should be allowed regardless of the default filter action. ``KVM_MSR_FILTER_WRITE`` Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap - indicates that a write should immediately fail, while a 1 indicates that - a write for a particular MSR should be handled regardless of the default + indicates that write accesses should be denied, while a 1 indicates that + a write for a particular MSR should be allowed regardless of the default filter action. flags values for ``struct kvm_msr_filter``: @@ -4120,57 +4120,55 @@ flags values for ``struct kvm_msr_filter``: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` If no filter range matches an MSR index that is getting accessed, KVM will - fall back to allowing access to the MSR. + allow accesses to all MSRs by default. ``KVM_MSR_FILTER_DEFAULT_DENY`` If no filter range matches an MSR index that is getting accessed, KVM will - fall back to rejecting access to the MSR. In this mode, all MSRs that should - be processed by KVM need to explicitly be marked as allowed in the bitmaps. + deny accesses to all MSRs by default. -This ioctl allows user space to define up to 16 bitmaps of MSR ranges to -specify whether a certain MSR access should be explicitly filtered for or not. +This ioctl allows userspace to define up to 16 bitmaps of MSR ranges to deny +guest MSR accesses that would normally be allowed by KVM. If an MSR is not +covered by a specific range, the "default" filtering behavior applies. Each +bitmap range covers MSRs from [base .. base+nmsrs). -If this ioctl has never been invoked, MSR accesses are not guarded and the -default KVM in-kernel emulation behavior is fully preserved. +If an MSR access is denied by userspace, the resulting KVM behavior depends on +whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is +enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace +on denied accesses, i.e. userspace effectively intercepts the MSR access. If +KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest +on denied accesses. + +If an MSR access is allowed by userspace, KVM will emulate and/or virtualize +the access in accordance with the vCPU model. Note, KVM may still ultimately +inject a #GP if an access is allowed by userspace, e.g. if KVM doesn't support +the MSR, or to follow architectural behavior for the MSR. + +By default, KVM operates in KVM_MSR_FILTER_DEFAULT_ALLOW mode with no MSR range +filters. Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes an error. -As soon as the filtering is in place, every MSR access is processed through -the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff); -x2APIC MSRs are always allowed, independent of the ``default_allow`` setting, -and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base -register. - .. warning:: - MSR accesses coming from nested vmentry/vmexit are not filtered. + MSR accesses as part of nested VM-Enter/VM-Exit are not filtered. This includes both writes to individual VMCS fields and reads/writes through the MSR lists pointed to by the VMCS. -If a bit is within one of the defined ranges, read and write accesses are -guarded by the bitmap's value for the MSR index if the kind of access -is included in the ``struct kvm_msr_filter_range`` flags. If no range -cover this particular access, the behavior is determined by the flags -field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` -and ``KVM_MSR_FILTER_DEFAULT_DENY``. - -Each bitmap range specifies a range of MSRs to potentially allow access on. -The range goes from MSR index [base .. base+nmsrs]. The flags field -indicates whether reads, writes or both reads and writes are filtered -by setting a 1 bit in the bitmap for the corresponding MSR index. - -If an MSR access is not permitted through the filtering, it generates a -#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that -allows user space to deflect and potentially handle various MSR accesses -into user space. + x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that + cover any x2APIC MSRs). Note, invoking this ioctl while a vCPU is running is inherently racy. However, KVM does guarantee that vCPUs will see either the previous filter or the new filter, e.g. MSRs with identical settings in both the old and new filter will have deterministic behavior. +Similarly, if userspace wishes to intercept on denied accesses, +KVM_MSR_EXIT_REASON_FILTER must be enabled before activating any filters, and +left enabled until after all filters are deactivated. Failure to do so may +result in KVM injecting a #GP instead of exiting to userspace. + 4.98 KVM_CREATE_SPAPR_TCE_64 ---------------------------- @@ -6500,6 +6498,8 @@ wants to write. Once finished processing the event, user space must continue vCPU execution. If the MSR write was unsuccessful, user space also sets the "error" field to "1". +See KVM_X86_SET_MSR_FILTER for details on the interaction with MSR filtering. + :: @@ -7937,7 +7937,7 @@ KVM_EXIT_X86_WRMSR exit notifications. This capability indicates that KVM supports that accesses to user defined MSRs may be rejected. With this capability exposed, KVM exports new VM ioctl KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR -ranges that KVM should reject access to. +ranges that KVM should deny access to. In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as -- GitLab From 1f158147181b83c5ae02273d0b3b9eddaebcc854 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Aug 2022 00:17:06 +0000 Subject: [PATCH 1021/1423] KVM: x86: Clean up KVM_CAP_X86_USER_SPACE_MSR documentation Clean up the KVM_CAP_X86_USER_SPACE_MSR documentation to eliminate misleading and/or inconsistent verbiage, and to actually document what accesses are intercepted by which flags. - s/will/may since not all #GPs are guaranteed to be intercepted - s/deflect/intercept to align with common KVM terminology - s/user space/userspace to align with the majority of KVM docs - Avoid using "trap" terminology, as KVM exits to userspace _before_ stepping, i.e. doesn't exhibit trap-like behavior - Actually document the flags Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220831001706.4075399-4-seanjc@google.com --- Documentation/virt/kvm/api.rst | 40 ++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4d07b866dea8..c6857f6b25ab0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6473,29 +6473,29 @@ if it decides to decode and emulate the instruction. Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code -will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR +may instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR exit for writes. -The "reason" field specifies why the MSR trap occurred. User space will only -receive MSR exit traps when a particular reason was requested during through +The "reason" field specifies why the MSR interception occurred. Userspace will +only receive MSR exits when a particular reason was requested during through ENABLE_CAP. Currently valid exit reasons are: KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER -For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest -wants to read. To respond to this request with a successful read, user space +For KVM_EXIT_X86_RDMSR, the "index" field tells userspace which MSR the guest +wants to read. To respond to this request with a successful read, userspace writes the respective data into the "data" field and must continue guest execution to ensure the read data is transferred into guest register state. -If the RDMSR request was unsuccessful, user space indicates that with a "1" in +If the RDMSR request was unsuccessful, userspace indicates that with a "1" in the "error" field. This will inject a #GP into the guest when the VCPU is executed again. -For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest -wants to write. Once finished processing the event, user space must continue -vCPU execution. If the MSR write was unsuccessful, user space also sets the +For KVM_EXIT_X86_WRMSR, the "index" field tells userspace which MSR the guest +wants to write. Once finished processing the event, userspace must continue +vCPU execution. If the MSR write was unsuccessful, userspace also sets the "error" field to "1". See KVM_X86_SET_MSR_FILTER for details on the interaction with MSR filtering. @@ -7265,19 +7265,27 @@ the module parameter for the target VM. :Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report :Returns: 0 on success; -1 on error -This capability enables trapping of #GP invoking RDMSR and WRMSR instructions -into user space. +This capability allows userspace to intercept RDMSR and WRMSR instructions if +access to an MSR is denied. By default, KVM injects #GP on denied accesses. When a guest requests to read or write an MSR, KVM may not implement all MSRs that are relevant to a respective system. It also does not differentiate by CPU type. -To allow more fine grained control over MSR handling, user space may enable +To allow more fine grained control over MSR handling, userspace may enable this capability. With it enabled, MSR accesses that match the mask specified in -args[0] and trigger a #GP event inside the guest by KVM will instead trigger -KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space -can then handle to implement model specific MSR handling and/or user notifications -to inform a user that an MSR was not handled. +args[0] and would trigger a #GP inside the guest will instead trigger +KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. Userspace +can then implement model specific MSR handling and/or user notifications +to inform a user that an MSR was not emulated/virtualized by KVM. + +The valid mask flags are: + + KVM_MSR_EXIT_REASON_UNKNOWN - intercept accesses to unknown (to KVM) MSRs + KVM_MSR_EXIT_REASON_INVAL - intercept accesses that are architecturally + invalid according to the vCPU model and/or mode + KVM_MSR_EXIT_REASON_FILTER - intercept accesses that are denied by userspace + via KVM_X86_SET_MSR_FILTER 7.22 KVM_CAP_X86_BUS_LOCK_EXIT ------------------------------- -- GitLab From 4a8fd4a720f8a8dbc370076d26388176c311218a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Aug 2022 00:07:21 +0000 Subject: [PATCH 1022/1423] KVM: nVMX: Reword comments about generating nested CR0/4 read shadows Reword the comments that (attempt to) document nVMX's overrides of the CR0/4 read shadows for L2 after calling vmx_set_cr0/4(). The important behavior that needs to be documented is that KVM needs to override the shadows to account for L1's masks even though the shadows are set by the common helpers (and that setting the shadows first would result in the correct shadows being clobbered). Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220831000721.4066617-1-seanjc@google.com --- arch/x86/kvm/vmx/nested.c | 9 +++------ arch/x86/kvm/vmx/nested.h | 7 ++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 61c83424285c8..b6f4411b613e9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2588,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_ept_init_mmu_context(vcpu); /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we - * have more bits than L1 expected. + * Override the CR0/CR4 read shadows after setting the effective guest + * CR0/CR4. The common helpers also set the shadows, but they don't + * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6312c9541c3cd..96952263b029b 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). + * Return the cr0/4 value that a nested guest would read. This is a combination + * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed + * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as + * the value+mask loaded into vmcs02 may not match the vmcs12 fields. */ static inline unsigned long nested_read_cr0(struct vmcs12 *fields) { -- GitLab From 0b5e7a16a0a79a3742f0df9e45bca46f01b40e6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:20:15 +0000 Subject: [PATCH 1023/1423] KVM: VMX: Make vmread_error_trampoline() uncallable from C code Declare vmread_error_trampoline() as an opaque symbol so that it cannot be called from C code, at least not without some serious fudging. The trampoline always passes parameters on the stack so that the inline VMREAD sequence doesn't need to clobber registers. regparm(0) was originally added to document the stack behavior, but it ended up being confusing because regparm(0) is a nop for 64-bit targets. Opportunustically wrap the trampoline and its declaration in #ifdeffery to make it even harder to invoke incorrectly, to document why it exists, and so that it's not left behind if/when CONFIG_CC_HAS_ASM_GOTO_OUTPUT is true for all supported toolchains. No functional change intended. Cc: Uros Bizjak Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220928232015.745948-1-seanjc@google.com --- arch/x86/kvm/vmx/vmenter.S | 2 ++ arch/x86/kvm/vmx/vmx_ops.h | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0b5db4de4d09e..766c6b3ef5ed9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run) .section .text, "ax" +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) +#endif SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) /* diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f6f23c7397dc5..842dc898c9728 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -11,14 +11,28 @@ #include "../x86.h" void vmread_error(unsigned long field, bool fault); -__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, - bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +/* + * The VMREAD error trampoline _always_ uses the stack to pass parameters, even + * for 64-bit targets. Preserving all registers allows the VMREAD inline asm + * blob to avoid clobbering GPRs, which in turn allows the compiler to better + * optimize sequences of VMREADs. + * + * Declare the trampoline as an opaque label as it's not safe to call from C + * code; there is no way to tell the compiler to pass params on the stack for + * 64-bit targets. + * + * void vmread_error_trampoline(unsigned long field, bool fault); + */ +extern unsigned long vmread_error_trampoline; +#endif + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, -- GitLab From d2a00af2061db863890e32a4a99a6f82c330df1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 23:23:51 +0000 Subject: [PATCH 1024/1423] KVM: VMX: Allow userspace to set all supported FEATURE_CONTROL bits Allow userspace to set all supported bits in MSR IA32_FEATURE_CONTROL irrespective of the guest CPUID model, e.g. via KVM_SET_MSRS. KVM's ABI is that userspace is allowed to set MSRs before CPUID, i.e. can set MSRs to values that would fault according to the guest CPUID model. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220607232353.3375324-2-seanjc@google.com --- arch/x86/kvm/vmx/vmx.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3f31c46c306e8..7be1fb50a753f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1836,12 +1836,38 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) +/* + * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of + * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain + * backwards compatibility even though KVM doesn't support emulating SMX. And + * because userspace set "VMX in SMX", the guest must also be allowed to set it, + * e.g. if the MSR is left unlocked and the guest does a RMW operation. + */ +#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ + FEAT_CTL_SGX_LC_ENABLED | \ + FEAT_CTL_SGX_ENABLED | \ + FEAT_CTL_LMCE_ENABLED) + +static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + uint64_t valid_bits; + + /* + * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are + * exposed to the guest. + */ + WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & + ~KVM_SUPPORTED_FEATURE_CONTROL); + + if (msr->host_initiated) + valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; + else + valid_bits = vmx->msr_ia32_feature_control_valid_bits; - return !(val & ~valid_bits); + return !(msr->data & ~valid_bits); } static int vmx_get_msr_feature(struct kvm_msr_entry *msr) @@ -2240,7 +2266,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vcpu, data) || + if (!vmx_feature_control_msr_valid(vmx, msr_info) || (to_vmx(vcpu)->msr_ia32_feature_control & FEAT_CTL_LOCKED && !msr_info->host_initiated)) return 1; -- GitLab From 2d6cd68636d60822219074b7c1d0bfe41321f106 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 23:23:52 +0000 Subject: [PATCH 1025/1423] KVM: VMX: Move MSR_IA32_FEAT_CTL.LOCKED check into "is valid" helper Move the check on IA32_FEATURE_CONTROL being locked, i.e. read-only from the guest, into the helper to check the overall validity of the incoming value. Opportunistically rename the helper to make it clear that it returns a bool. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220607232353.3375324-3-seanjc@google.com --- arch/x86/kvm/vmx/vmx.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7be1fb50a753f..fe5615fd8295c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1850,8 +1850,8 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) FEAT_CTL_SGX_ENABLED | \ FEAT_CTL_LMCE_ENABLED) -static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, - struct msr_data *msr) +static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { uint64_t valid_bits; @@ -1862,6 +1862,10 @@ static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & ~KVM_SUPPORTED_FEATURE_CONTROL); + if (!msr->host_initiated && + (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) + return false; + if (msr->host_initiated) valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; else @@ -2266,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vmx, msr_info) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEAT_CTL_LOCKED && !msr_info->host_initiated)) + if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); -- GitLab From b80732fdc9b235046687a2999ed198fa55fde901 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 23:23:53 +0000 Subject: [PATCH 1026/1423] KVM: selftests: Verify userspace can stuff IA32_FEATURE_CONTROL at will Verify the KVM allows userspace to set all supported bits in the IA32_FEATURE_CONTROL MSR irrespective of the current guest CPUID, and that all unsupported bits are rejected. Throw the testcase into vmx_msrs_test even though it's not technically a VMX MSR; it's close enough, and the most frequently feature controlled by the MSR is VMX. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220607232353.3375324-4-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 2 + .../selftests/kvm/x86_64/vmx_msrs_test.c | 47 +++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 5d310abe6c3f4..ac4590abb44d7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -102,6 +102,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_XMM2 KVM_X86_CPU_FEATURE(0x1, 0, EDX, 26) #define X86_FEATURE_FSGSBASE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 0) #define X86_FEATURE_TSC_ADJUST KVM_X86_CPU_FEATURE(0x7, 0, EBX, 1) +#define X86_FEATURE_SGX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 2) #define X86_FEATURE_HLE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 4) #define X86_FEATURE_SMEP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 7) #define X86_FEATURE_INVPCID KVM_X86_CPU_FEATURE(0x7, 0, EBX, 10) @@ -115,6 +116,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) #define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) #define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) +#define X86_FEATURE_SGX_LC KVM_X86_CPU_FEATURE(0x7, 0, ECX, 30) #define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) #define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) #define X86_FEATURE_AMX_TILE KVM_X86_CPU_FEATURE(0x7, 0, EDX, 24) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c index 322d561b4260e..90720b6205f4e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c @@ -67,6 +67,52 @@ static void vmx_save_restore_msrs_test(struct kvm_vcpu *vcpu) vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_VMFUNC, -1ull); } +static void __ia32_feature_control_msr_test(struct kvm_vcpu *vcpu, + uint64_t msr_bit, + struct kvm_x86_cpu_feature feature) +{ + uint64_t val; + + vcpu_clear_cpuid_feature(vcpu, feature); + + val = vcpu_get_msr(vcpu, MSR_IA32_FEAT_CTL); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val); + + if (!kvm_cpu_has(feature)) + return; + + vcpu_set_cpuid_feature(vcpu, feature); +} + +static void ia32_feature_control_msr_test(struct kvm_vcpu *vcpu) +{ + uint64_t supported_bits = FEAT_CTL_LOCKED | + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | + FEAT_CTL_SGX_LC_ENABLED | + FEAT_CTL_SGX_ENABLED | + FEAT_CTL_LMCE_ENABLED; + int bit, r; + + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_SMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX_LC); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_LMCE_ENABLED, X86_FEATURE_MCE); + + for_each_clear_bit(bit, &supported_bits, 64) { + r = _vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, BIT(bit)); + TEST_ASSERT(r == 0, + "Setting reserved bit %d in IA32_FEATURE_CONTROL should fail", bit); + } +} + int main(void) { struct kvm_vcpu *vcpu; @@ -79,6 +125,7 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, NULL); vmx_save_restore_msrs_test(vcpu); + ia32_feature_control_msr_test(vcpu); kvm_vm_free(vm); } -- GitLab From 3ebcbd2244f5a69e06e5f655bfbd8127c08201c7 Mon Sep 17 00:00:00 2001 From: Anton Romanov Date: Wed, 8 Jun 2022 18:35:26 +0000 Subject: [PATCH 1027/1423] KVM: x86: Use current rather than snapshotted TSC frequency if it is constant Don't snapshot tsc_khz into per-cpu cpu_tsc_khz if the host TSC is constant, in which case the actual TSC frequency will never change and thus capturing TSC during initialization is unnecessary, KVM can simply use tsc_khz. This value is snapshotted from kvm_timer_init->kvmclock_cpu_online->tsc_khz_changed(NULL) On CPUs with constant TSC, but not a hardware-specified TSC frequency, snapshotting cpu_tsc_khz and using that to set a VM's target TSC frequency can lead to VM to think its TSC frequency is not what it actually is if refining the TSC completes after KVM snapshots tsc_khz. The actual frequency never changes, only the kernel's calculation of what that frequency is changes. Ideally, KVM would not be able to race with TSC refinement, or would have a hook into tsc_refine_calibration_work() to get an alert when refinement is complete. Avoiding the race altogether isn't practical as refinement takes a relative eternity; it's deliberately put on a work queue outside of the normal boot sequence to avoid unnecessarily delaying boot. Adding a hook is doable, but somewhat gross due to KVM's ability to be built as a module. And if the TSC is constant, which is likely the case for every VMX/SVM-capable CPU produced in the last decade, the race can be hit if and only if userspace is able to create a VM before TSC refinement completes; refinement is slow, but not that slow. For now, punt on a proper fix, as not taking a snapshot can help some uses cases and not taking a snapshot is arguably correct irrespective of the race with refinement. Signed-off-by: Anton Romanov Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220608183525.1143682-1-romanton@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef12747ecb63d..152ea4993b76f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2974,6 +2974,22 @@ static void kvm_update_masterclock(struct kvm *kvm) kvm_end_pvclock_update(kvm); } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { @@ -2984,7 +3000,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); data->flags = 0; - if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { #ifdef CONFIG_X86_64 struct timespec64 ts; @@ -2998,7 +3015,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); @@ -3108,7 +3125,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -9038,9 +9055,11 @@ static void tsc_khz_changed(void *data) struct cpufreq_freqs *freq = data; unsigned long khz = 0; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); + if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -9061,8 +9080,10 @@ static void kvm_hyperv_tsc_notifier(void) hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -9199,10 +9220,10 @@ static void kvm_timer_init(void) } cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", - kvmclock_cpu_online, kvmclock_cpu_down_prep); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", + kvmclock_cpu_online, kvmclock_cpu_down_prep); + } } #ifdef CONFIG_X86_64 @@ -9362,10 +9383,11 @@ void kvm_arch_exit(void) #endif kvm_lapic_exit(); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + } #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); irq_work_sync(&pvclock_irq_work); -- GitLab From 10aa7cd398a9ead7464a7f8b49d4e4c843806813 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 30 Nov 2022 17:44:37 +0800 Subject: [PATCH 1028/1423] IB/hfi1: Switch to netif_napi_add() There is no need to use netif_napi_add_weight() when the weight argument is 64. See "net: drop the weight argument from netif_napi_add". Signed-off-by: Yang Yang Link: https://lore.kernel.org/r/202211301744378304494@zte.com.cn Reviewed-by: xu xin Reviewed-by: Zhang Yunkai Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/netdev_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 3dfa5aff25125..720d4c85c9c93 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) * right now. */ set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); - netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi); rc = msix_netdev_request_rcd_irq(rxq->rcd); if (rc) goto bail_context_irq_failure; -- GitLab From 38931d8989b5760b0bd17c9ec99e81986258e4cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 22 Sep 2022 13:08:16 -0700 Subject: [PATCH 1029/1423] mm: Make ksize() a reporting-only function With all "silently resizing" callers of ksize() refactored, remove the logic in ksize() that would allow it to be used to effectively change the size of an allocation (bypassing __alloc_size hints, etc). Users wanting this feature need to either use kmalloc_size_roundup() before an allocation, or use krealloc() directly. For kfree_sensitive(), move the unpoisoning logic inline. Replace the some of the partially open-coded ksize() in __do_krealloc with ksize() now that it doesn't perform unpoisoning. Adjust the KUnit tests to match the new ksize() behavior. Execution tested with: $ ./tools/testing/kunit/kunit.py run \ --kconfig_add CONFIG_KASAN=y \ --kconfig_add CONFIG_KASAN_GENERIC=y \ --arch x86_64 kasan Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Andrew Morton Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: linux-mm@kvack.org Cc: kasan-dev@googlegroups.com Acked-by: Vlastimil Babka Acked-by: David Rientjes Enhanced-by: Andrey Konovalov Signed-off-by: Kees Cook --- mm/kasan/kasan_test.c | 19 +++++++++++++------ mm/slab_common.c | 26 ++++++++++---------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 0d59098f08761..73684642c42d8 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -783,23 +783,30 @@ static void kasan_global_oob_left(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -/* Check that ksize() makes the whole object accessible. */ +/* Check that ksize() does NOT unpoison whole object. */ static void ksize_unpoisons_memory(struct kunit *test) { char *ptr; - size_t size = 123, real_size; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; + size_t real_size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + real_size = ksize(ptr); + KUNIT_EXPECT_GT(test, real_size, size); OPTIMIZER_HIDE_VAR(ptr); - /* This access shouldn't trigger a KASAN report. */ - ptr[size] = 'x'; + /* These accesses shouldn't trigger a KASAN report. */ + ptr[0] = 'x'; + ptr[size - 1] = 'x'; - /* This one must. */ - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]); + /* These must trigger a KASAN report. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]); kfree(ptr); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 33b1886b06ebf..7e96abf1bd7d1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1333,11 +1333,11 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) void *ret; size_t ks; - /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + /* Check for double-free before calling ksize. */ if (likely(!ZERO_OR_NULL_PTR(p))) { if (!kasan_check_byte(p)) return NULL; - ks = kfence_ksize(p) ?: __ksize(p); + ks = ksize(p); } else ks = 0; @@ -1405,8 +1405,10 @@ void kfree_sensitive(const void *p) void *mem = (void *)p; ks = ksize(mem); - if (ks) + if (ks) { + kasan_unpoison_range(mem, ks); memzero_explicit(mem, ks); + } kfree(mem); } EXPORT_SYMBOL(kfree_sensitive); @@ -1427,13 +1429,11 @@ EXPORT_SYMBOL(kfree_sensitive); */ size_t ksize(const void *objp) { - size_t size; - /* - * We need to first check that the pointer to the object is valid, and - * only then unpoison the memory. The report printed from ksize() is - * more useful, then when it's printed later when the behaviour could - * be undefined due to a potential use-after-free or double-free. + * We need to first check that the pointer to the object is valid. + * The KASAN report printed from ksize() is more useful, then when + * it's printed later when the behaviour could be undefined due to + * a potential use-after-free or double-free. * * We use kasan_check_byte(), which is supported for the hardware * tag-based KASAN mode, unlike kasan_check_read/write(). @@ -1447,13 +1447,7 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = kfence_ksize(objp) ?: __ksize(objp); - /* - * We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_range(objp, size); - return size; + return kfence_ksize(objp) ?: __ksize(objp); } EXPORT_SYMBOL(ksize); -- GitLab From 25226df4b9be7f6d5d722af5b75e86e76e5c3a80 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 21 Sep 2022 13:46:03 -0500 Subject: [PATCH 1030/1423] mm/pgtable: Fix multiple -Wstringop-overflow warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual size of the following arrays at run-time depends on CONFIG_X86_PAE. 427 pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; 428 pmd_t *pmds[MAX_PREALLOCATED_PMDS]; If CONFIG_X86_PAE is not enabled, their final size will be zero (which is technically not a legal storage size in C, but remains "valid" via the GNU extension). In that case, the compiler complains about trying to access objects of size zero when calling functions where these objects are passed as arguments. Fix this by sanity-checking the size of those arrays just before the function calls. Also, the following warnings are fixed by these changes when building with GCC 11+ and -Wstringop-overflow enabled: arch/x86/mm/pgtable.c:437:13: warning: ‘preallocate_pmds.constprop’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] arch/x86/mm/pgtable.c:440:13: warning: ‘preallocate_pmds.constprop’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] arch/x86/mm/pgtable.c:462:9: warning: ‘free_pmds.constprop’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] arch/x86/mm/pgtable.c:455:9: warning: ‘pgd_prepopulate_user_pmd’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] arch/x86/mm/pgtable.c:464:9: warning: ‘free_pmds.constprop’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] This is one of the last cases in the ongoing effort to globally enable -Wstringop-overflow. The alternative to this is to make the originally suggested change: make the pmds argument from an array pointer to a pointer pointer. That situation is considered "legal" for C in the sense that it does not have a way to reason about the storage. i.e.: -static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmds) With the above change, there's no difference in binary output, and the compiler warning is silenced. However, with this patch, the compiler can actually figure out that it isn't using the code at all, and it gets dropped: text data bss dec hex filename 8218 718 32 8968 2308 arch/x86/mm/pgtable.o.before 7765 694 32 8491 212b arch/x86/mm/pgtable.o.after So this case (fixing a warning and reducing image size) is a clear win. Additionally drops an old work-around for GCC in the same code. Link: https://github.com/KSPP/linux/issues/203 Link: https://github.com/KSPP/linux/issues/181 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/Yytb67xvrnctxnEe@work --- arch/x86/mm/pgtable.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8525f2876fb40..e4f499eb0f295 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -299,9 +299,6 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud_t *pud; int i; - if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ - return; - p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); @@ -434,10 +431,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) + if (sizeof(pmds) != 0 && + preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; - if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) + if (sizeof(u_pmds) != 0 && + preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) @@ -451,17 +450,22 @@ pgd_t *pgd_alloc(struct mm_struct *mm) spin_lock(&pgd_lock); pgd_ctor(mm, pgd); - pgd_prepopulate_pmd(mm, pgd, pmds); - pgd_prepopulate_user_pmd(mm, pgd, u_pmds); + if (sizeof(pmds) != 0) + pgd_prepopulate_pmd(mm, pgd, pmds); + + if (sizeof(u_pmds) != 0) + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: - free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); + if (sizeof(u_pmds) != 0) + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: - free_pmds(mm, pmds, PREALLOCATED_PMDS); + if (sizeof(pmds) != 0) + free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: -- GitLab From 9360d035a579d95d1e76c471061b9065b18a0eb1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:21 -0800 Subject: [PATCH 1031/1423] panic: Separate sysctl logic from CONFIG_SMP In preparation for adding more sysctls directly in kernel/panic.c, split CONFIG_SMP from the logic that adds sysctls. Cc: Petr Mladek Cc: Andrew Morton Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: Sebastian Andrzej Siewior Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-1-keescook@chromium.org --- kernel/panic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index da323209f5833..d843d036651e0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,8 +75,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +#ifdef CONFIG_SYSCTL static struct ctl_table kern_panic_table[] = { +#ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, @@ -86,6 +87,7 @@ static struct ctl_table kern_panic_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif { } }; -- GitLab From d4ccd54d28d3c8598e2354acc13e28c060961dbb Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 17 Nov 2022 15:43:22 -0800 Subject: [PATCH 1032/1423] exit: Put an upper limit on how often we can oops Many Linux systems are configured to not panic on oops; but allowing an attacker to oops the system **really** often can make even bugs that look completely unexploitable exploitable (like NULL dereferences and such) if each crash elevates a refcount by one or a lock is taken in read mode, and this causes a counter to eventually overflow. The most interesting counters for this are 32 bits wide (like open-coded refcounts that don't use refcount_t). (The ldsem reader count on 32-bit platforms is just 16 bits, but probably nobody cares about 32-bit platforms that much nowadays.) So let's panic the system if the kernel is constantly oopsing. The speed of oopsing 2^32 times probably depends on several factors, like how long the stack trace is and which unwinder you're using; an empirically important one is whether your console is showing a graphical environment or a text console that oopses will be printed to. In a quick single-threaded benchmark, it looks like oopsing in a vfork() child with a very short stack trace only takes ~510 microseconds per run when a graphical console is active; but switching to a text console that oopses are printed to slows it down around 87x, to ~45 milliseconds per run. (Adding more threads makes this faster, but the actual oops printing happens under &die_lock on x86, so you can maybe speed this up by a factor of around 2 and then any further improvement gets eaten up by lock contention.) It looks like it would take around 8-12 days to overflow a 32-bit counter with repeated oopsing on a multi-core X86 system running a graphical environment; both me (in an X86 VM) and Seth (with a distro kernel on normal hardware in a standard configuration) got numbers in that ballpark. 12 days aren't *that* short on a desktop system, and you'd likely need much longer on a typical server system (assuming that people don't run graphical desktop environments on their servers), and this is a *very* noisy and violent approach to exploiting the kernel; and it also seems to take orders of magnitude longer on some machines, probably because stuff like EFI pstore will slow it down a ton if that's active. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20221107201317.324457-1-jannh@google.com Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-2-keescook@chromium.org --- Documentation/admin-guide/sysctl/kernel.rst | 8 ++++ kernel/exit.c | 42 +++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 98d1b198b2b4c..09f3fb2f85858 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -667,6 +667,14 @@ This is the default behavior. an oops event is detected. +oops_limit +========== + +Number of kernel oopses after which the kernel should panic when +``panic_on_oops`` is not set. Setting this to 0 or 1 has the same effect +as setting ``panic_on_oops=1``. + + osrelease, ostype & version =========================== diff --git a/kernel/exit.c b/kernel/exit.c index 35e0a31a0315c..2ab3ead621189 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,33 @@ #include #include +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -874,6 +901,8 @@ void __noreturn do_exit(long code) void __noreturn make_task_dead(int signr) { + static atomic_t oops_count = ATOMIC_INIT(0); + /* * Take the task off the cpu after something catastrophic has * happened. @@ -897,6 +926,19 @@ void __noreturn make_task_dead(int signr) preempt_count_set(PREEMPT_ENABLED); } + /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit)) + panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit); + /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. -- GitLab From 9db89b41117024f80b38b15954017fb293133364 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:23 -0800 Subject: [PATCH 1033/1423] exit: Expose "oops_count" to sysfs Since Oops count is now tracked and is a fairly interesting signal, add the entry /sys/kernel/oops_count to expose it to userspace. Cc: "Eric W. Biederman" Cc: Jann Horn Cc: Arnd Bergmann Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-3-keescook@chromium.org --- .../ABI/testing/sysfs-kernel-oops_count | 6 +++++ MAINTAINERS | 1 + kernel/exit.c | 22 +++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-oops_count diff --git a/Documentation/ABI/testing/sysfs-kernel-oops_count b/Documentation/ABI/testing/sysfs-kernel-oops_count new file mode 100644 index 0000000000000..156cca9dbc960 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-oops_count @@ -0,0 +1,6 @@ +What: /sys/kernel/oops_count +Date: November 2022 +KernelVersion: 6.2.0 +Contact: Linux Kernel Hardening List +Description: + Shows how many times the system has Oopsed since last boot. diff --git a/MAINTAINERS b/MAINTAINERS index 1cd80c1137212..0a1e95a58e54c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11106,6 +11106,7 @@ M: Kees Cook L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening +F: Documentation/ABI/testing/sysfs-kernel-oops_count F: include/linux/overflow.h F: include/linux/randomize_kstack.h F: mm/usercopy.c diff --git a/kernel/exit.c b/kernel/exit.c index 2ab3ead621189..dc1a32149f944 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -99,6 +100,25 @@ static __init int kernel_exit_sysctls_init(void) late_initcall(kernel_exit_sysctls_init); #endif +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -901,8 +921,6 @@ void __noreturn do_exit(long code) void __noreturn make_task_dead(int signr) { - static atomic_t oops_count = ATOMIC_INIT(0); - /* * Take the task off the cpu after something catastrophic has * happened. -- GitLab From 9d720a5a658f5135861773f26e927449bef93d61 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: [PATCH 1034/1423] xfs: hoist refcount record merge predicates Hoist these multiline conditionals into separate static inline helpers to improve readability and set the stage for corruption fixes that will be introduced in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 129 ++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3f34bafe18dd1..4408893333a6c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -815,11 +815,119 @@ out_error: /* Is this extent valid? */ static inline bool xfs_refc_valid( - struct xfs_refcount_irec *rc) + const struct xfs_refcount_irec *rc) { return rc->rc_startblock != NULLAGBLOCK; } +static inline bool +xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + bool cleft_is_cright, + enum xfs_refc_adjust_op adjust, + unsigned long long *ulenp) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * To merge with a center record, both shoulder records must be + * adjacent to the record we want to adjust. This is only true if + * find_left and find_right made all four records valid. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) + return false; + + /* There must only be one record for the entire range. */ + if (!cleft_is_cright) + return false; + + /* The shoulder record refcounts must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + if (right->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount + right->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + *ulenp = ulen; + return true; +} + +static inline bool +xfs_refc_want_merge_left( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the + * start of the range. If this is true, find_left made left and cleft + * contain valid contents. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) + return false; + + /* Left shoulder record refcount must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + +static inline bool +xfs_refc_want_merge_right( + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = right->rc_blockcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the + * end of the range. If this is true, find_right made cright and right + * contain valid contents. + */ + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) + return false; + + /* Right shoulder record refcount must match the new refcount. */ + if (right->rc_refcount != cright->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cright->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + /* * Try to merge with any extents on the boundaries of the adjustment range. */ @@ -861,23 +969,15 @@ xfs_refcount_merge_extents( (cleft.rc_blockcount == cright.rc_blockcount); /* Try to merge left, cleft, and right. cleft must == cright. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + - right.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && - left.rc_refcount == cleft.rc_refcount + adjust && - right.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, + adjust, &ulen)) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, &right, ulen, aglen); } /* Try to merge left and cleft. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && - left.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { *shape_changed = true; error = xfs_refcount_merge_left_extent(cur, &left, &cleft, agbno, aglen); @@ -893,10 +993,7 @@ xfs_refcount_merge_extents( } /* Try to merge cright and right. */ - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && - right.rc_refcount == cright.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, aglen); -- GitLab From b25d1984aa884fc91a73a5a407b9ac976d441e9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: [PATCH 1035/1423] xfs: estimate post-merge refcounts correctly Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount metadata corruptions after being run. Specifically, xfs_repair noticed single-block refcount records that could be combined but had not been. The root cause of this is improper MAXREFCOUNT edge case handling in xfs_refcount_merge_extents. When we're trying to find candidates for a refcount btree record merge, we compute the refcount attribute of the merged record, but we fail to account for the fact that once a record hits rc_refcount == MAXREFCOUNT, it is pinned that way forever. Hence the computed refcount is wrong, and we fail to merge the extents. Fix this by adjusting the merge predicates to compute the adjusted refcount correctly. Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 4408893333a6c..6f7ed9288fe40 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -820,6 +820,17 @@ xfs_refc_valid( return rc->rc_startblock != NULLAGBLOCK; } +static inline xfs_nlink_t +xfs_refc_merge_refcount( + const struct xfs_refcount_irec *irec, + enum xfs_refc_adjust_op adjust) +{ + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ + if (irec->rc_refcount == MAXREFCOUNT) + return MAXREFCOUNT; + return irec->rc_refcount + adjust; +} + static inline bool xfs_refc_want_merge_center( const struct xfs_refcount_irec *left, @@ -831,6 +842,7 @@ xfs_refc_want_merge_center( unsigned long long *ulenp) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * To merge with a center record, both shoulder records must be @@ -846,9 +858,10 @@ xfs_refc_want_merge_center( return false; /* The shoulder record refcounts must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; - if (right->rc_refcount != cleft->rc_refcount + adjust) + if (right->rc_refcount != new_refcount) return false; /* @@ -871,6 +884,7 @@ xfs_refc_want_merge_left( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a left merge, the left shoulder record must be adjacent to the @@ -881,7 +895,8 @@ xfs_refc_want_merge_left( return false; /* Left shoulder record refcount must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; /* @@ -903,6 +918,7 @@ xfs_refc_want_merge_right( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = right->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a right merge, the right shoulder record must be adjacent to the @@ -913,7 +929,8 @@ xfs_refc_want_merge_right( return false; /* Right shoulder record refcount must match the new refcount. */ - if (right->rc_refcount != cright->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cright, adjust); + if (right->rc_refcount != new_refcount) return false; /* -- GitLab From 8c25febf23963431686f04874b96321288504127 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: [PATCH 1036/1423] xfs: get rid of assert from xfs_btree_islastblock xfs_btree_check_block contains debugging knobs. With XFS_DEBUG setting up, turn on the debugging knob can trigger the assert of xfs_btree_islastblock, test script as follows: while true do mount $disk $mountpoint fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null echo 1 > /sys/fs/xfs/sda/errortag/btree_chk_sblk sleep 10 umount $mountpoint done Kick off fsstress and only *then* turn on the debugging knob. If it happens that the knob gets turned on after the cntbt lookup succeeds but before the call to xfs_btree_islastblock, then we *can* end up in the situation where a previously checked btree block suddenly starts returning EFSCORRUPTED from xfs_btree_check_block. Kaboom. Darrick give a very detailed explanation as follows: Looking back at commit 27d9ee577dcce, I think the point of all this was to make sure that the cursor has actually performed a lookup, and that the btree block at whatever level we're asking about is ok. If the caller hasn't ever done a lookup, the bc_levels array will be empty, so cur->bc_levels[level].bp pointer will be NULL. The call to xfs_btree_get_block will crash anyway, so the "ASSERT(block);" part is pointless. If the caller did a lookup but the lookup failed due to block corruption, the corresponding cur->bc_levels[level].bp pointer will also be NULL, and we'll still crash. The "ASSERT(xfs_btree_check_block);" logic is also unnecessary. If the cursor level points to an inode root, the block buffer will be incore, so it had better always be consistent. If the caller ignores a failed lookup after a successful one and calls this function, the cursor state is garbage and the assert wouldn't have tripped anyway. So get rid of the assert. Fixes: 27d9ee577dcc ("xfs: actually check xfs_btree_check_block return in xfs_btree_islastblock") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index eef27858a0135..29c4b4ccb909f 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -556,7 +556,6 @@ xfs_btree_islastblock( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); -- GitLab From ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: [PATCH 1037/1423] xfs: invalidate xfs_bufs when allocating cow extents While investigating test failures in xfs/17[1-3] in alwayscow mode, I noticed through code inspection that xfs_bmap_alloc_userdata isn't setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW fork. COW staging extents should be flagged as USERDATA, since user data are persisted to these blocks before being remapped into a file. This mis-classification has a few impacts on the behavior of the system. First, the filestreams allocator is supposed to keep allocating from a chosen AG until it runs out of space in that AG. However, it only does that for USERDATA allocations, which means that COW allocations aren't tied to the filestreams AG. Fortunately, few people use filestreams, so nobody's noticed. A more serious problem is that xfs_alloc_ag_vextent_small looks for a buffer to invalidate *if* the USERDATA flag is set and the AG is so full that the allocation had to come from the AGFL because the cntbt is empty. The consequences of not invalidating the buffer are severe -- if the AIL incorrectly checkpoints a buffer that is now being used to store user data, that action will clobber the user's written data. Fix filestreams and yet another data corruption vector by flagging COW allocations as USERDATA. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 56b9b7db38bbd..0d56a8d862e80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; -- GitLab From 3d50b95b50db36de945bfc34d4d94b7e11ee8fd9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 14 Nov 2022 11:56:06 +0000 Subject: [PATCH 1038/1423] i2c: smbus: add DDR support for SPD On my x05 laptop I got: Memory type 0x12 not supported yet, not instantiating SPD Adding the 0x12 case lead to a successful instantiated SPD AT24 EEPROM. i801_smbus 0000:00:1f.3: SMBus using polling i2c i2c-6: 2/2 memory slots populated (from DMI) at24 6-0050: 256 byte spd EEPROM, read-only i2c i2c-6: Successfully instantiated SPD at 0x50 at24 6-0051: 256 byte spd EEPROM, read-only And then, I decoded it successfully via decode-dimms. Signed-off-by: Corentin Labbe Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-smbus.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index 07c92c8495a3c..c85710ed95483 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -361,9 +361,15 @@ void i2c_register_spd(struct i2c_adapter *adap) return; } + /* + * Memory types could be found at section 7.18.2 (Memory Device — Type), table 78 + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.6.0.pdf + */ switch (common_mem_type) { + case 0x12: /* DDR */ case 0x13: /* DDR2 */ case 0x18: /* DDR3 */ + case 0x1B: /* LPDDR */ case 0x1C: /* LPDDR2 */ case 0x1D: /* LPDDR3 */ name = "spd"; -- GitLab From 5bf71889ad9a4d39b7665c105a005c5a33d730ba Mon Sep 17 00:00:00 2001 From: Akhil R Date: Thu, 17 Nov 2022 15:34:15 +0530 Subject: [PATCH 1039/1423] i2c: tegra: Set ACPI node as primary fwnode Set ACPI node as the primary fwnode of I2C adapter to allow enumeration of child devices from the ACPI table Signed-off-by: Zubair Waheed Signed-off-by: Akhil R Reviewed-by: Thierry Reding Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 954022c04cc42..69c9ae161bbec 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1826,6 +1826,7 @@ static int tegra_i2c_probe(struct platform_device *pdev) i2c_dev->adapter.class = I2C_CLASS_DEPRECATED; i2c_dev->adapter.algo = &tegra_i2c_algo; i2c_dev->adapter.nr = pdev->id; + ACPI_COMPANION_SET(&i2c_dev->adapter.dev, ACPI_COMPANION(&pdev->dev)); if (i2c_dev->hw->supports_bus_clear) i2c_dev->adapter.bus_recovery_info = &tegra_i2c_recovery_info; -- GitLab From de917701da5d5ccb31825dca850ac49399e0f289 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 15 Nov 2022 12:30:18 +0000 Subject: [PATCH 1040/1423] dt-bindings: i2c: renesas,riic: Document RZ/Five SoC The RIIC block on the RZ/Five SoC is identical to one found on the RZ/G2UL SoC. "renesas,riic-r9a07g043" compatible string will be used on the RZ/Five SoC so to make this clear, update the comment to include RZ/Five SoC. No driver changes are required as generic compatible string "renesas,riic-rz" will be used as a fallback on RZ/Five SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/renesas,riic.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/renesas,riic.yaml b/Documentation/devicetree/bindings/i2c/renesas,riic.yaml index d3c0d5c427acb..2291a7cd619be 100644 --- a/Documentation/devicetree/bindings/i2c/renesas,riic.yaml +++ b/Documentation/devicetree/bindings/i2c/renesas,riic.yaml @@ -19,7 +19,7 @@ properties: - enum: - renesas,riic-r7s72100 # RZ/A1H - renesas,riic-r7s9210 # RZ/A2M - - renesas,riic-r9a07g043 # RZ/G2UL + - renesas,riic-r9a07g043 # RZ/G2UL and RZ/Five - renesas,riic-r9a07g044 # RZ/G2{L,LC} - renesas,riic-r9a07g054 # RZ/V2L - const: renesas,riic-rz # RZ/A or RZ/G2L -- GitLab From a33004e844e4c60da86ecf8c249aab7179817fce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Nov 2022 17:52:59 +0000 Subject: [PATCH 1041/1423] KVM: selftests: Fix inverted "warning" in access tracking perf test Warn if the number of idle pages is greater than or equal to 10% of the total number of pages, not if the percentage of idle pages is less than 10%. The original code asserted that less than 10% of pages were still idle, but the check got inverted when the assert was converted to a warning. Opportunistically clean up the warning; selftests are 64-bit only, there is no need to use "%PRIu64" instead of "%lu". Fixes: 6336a810db5c ("KVM: selftests: replace assertion with warning in access_tracking_perf_test") Reviewed-by: Emanuele Giuseppe Esposito Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221129175300.4052283-2-seanjc@google.com --- tools/testing/selftests/kvm/access_tracking_perf_test.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 02d3587cab0a3..d45ef319a68f3 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -185,10 +185,9 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, * happens, much more pages are cached there and guest won't see the * "idle" bit cleared. */ - if (still_idle < pages / 10) - printf("WARNING: vCPU%d: Too many pages still idle (%" PRIu64 - "out of %" PRIu64 "), this will affect performance results" - ".\n", + if (still_idle >= pages / 10) + printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), " + "this will affect performance results.\n", vcpu_idx, still_idle, pages); close(page_idle_fd); -- GitLab From 8fcee0421386344b58fdc1cd5940219617037968 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Nov 2022 17:53:00 +0000 Subject: [PATCH 1042/1423] KVM: selftests: Restore assert for non-nested VMs in access tracking test Restore the assert (on x86-64) that <10% of pages are still idle when NOT running as a nested VM in the access tracking test. The original assert was converted to a "warning" to avoid false failures when running the test in a VM, but the non-nested case does not suffer from the same "infinite TLB size" issue. Using the HYPERVISOR flag isn't infallible as VMMs aren't strictly required to enumerate the "feature" in CPUID, but practically speaking anyone that is running KVM selftests in VMs is going to be using a VMM and hypervisor that sets the HYPERVISOR flag. Cc: David Matlack Reviewed-by: Emanuele Giuseppe Esposito Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221129175300.4052283-3-seanjc@google.com --- .../selftests/kvm/access_tracking_perf_test.c | 17 ++++++++++++----- .../selftests/kvm/include/x86_64/processor.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index d45ef319a68f3..9f9503e40ca56 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -46,6 +46,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "processor.h" /* Global variable used to synchronize all of the vCPU threads. */ static int iteration; @@ -180,15 +181,21 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, * access tracking but low enough as to not make the test too brittle * over time and across architectures. * - * Note that when run in nested virtualization, this check will trigger - * much more frequently because TLB size is unlimited and since no flush - * happens, much more pages are cached there and guest won't see the - * "idle" bit cleared. + * When running the guest as a nested VM, "warn" instead of asserting + * as the TLB size is effectively unlimited and the KVM doesn't + * explicitly flush the TLB when aging SPTEs. As a result, more pages + * are cached and the guest won't see the "idle" bit cleared. */ - if (still_idle >= pages / 10) + if (still_idle >= pages / 10) { +#ifdef __x86_64__ + TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR), + "vCPU%d: Too many pages still idle (%lu out of %lu)", + vcpu_idx, still_idle, pages); +#endif printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), " "this will affect performance results.\n", vcpu_idx, still_idle, pages); + } close(page_idle_fd); close(pagemap_fd); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 5d310abe6c3f4..22852bd32d7b7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -94,6 +94,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) #define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) #define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) +#define X86_FEATURE_HYPERVISOR KVM_X86_CPU_FEATURE(0x1, 0, ECX, 31) #define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6) #define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) #define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) -- GitLab From 18eee7bfd18d6c9586dd224cf5b74258700fe815 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Mon, 28 Nov 2022 22:57:32 +0000 Subject: [PATCH 1043/1423] KVM: selftests: Move XFD CPUID checking out of __vm_xsave_require_permission() Move the kvm_cpu_has() check on X86_FEATURE_XFD out of the helper to enable off-by-default XSAVE-managed features and into the one test that currenty requires XFD (XFeature Disable) support. kvm_cpu_has() uses kvm_get_supported_cpuid() and thus caches KVM_GET_SUPPORTED_CPUID, and so using kvm_cpu_has() before ARCH_REQ_XCOMP_GUEST_PERM effectively results in the test caching stale values, e.g. subsequent checks on AMX_TILE will get false negatives. Although off-by-default features are nonsensical without XFD, checking for XFD virtualization prior to enabling such features isn't strictly required. Signed-off-by: Lei Wang Fixes: 7fbb653e01fd ("KVM: selftests: Check KVM's supported CPUID, not host CPUID, for XFD") Link: https://lore.kernel.org/r/20221125023839.315207-1-lei4.wang@intel.com [sean: add Fixes, reword changelog] Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221128225735.3291648-2-seanjc@google.com --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 -- tools/testing/selftests/kvm/x86_64/amx_test.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index d532c20c74fda..aac7b32a794b5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -563,8 +563,6 @@ void __vm_xsave_require_permission(int bit, const char *name) .addr = (unsigned long) &bitmask }; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); - kvm_fd = open_kvm_dev_path_or_exit(); rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); close(kvm_fd); diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 21de6ae42086d..1256c7faadd32 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -254,6 +254,7 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); -- GitLab From 2ceade1d363c934633a1788d0f98fc2332062b92 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 28 Nov 2022 22:57:33 +0000 Subject: [PATCH 1044/1423] KVM: selftests: Move __vm_xsave_require_permission() below CPUID helpers Move __vm_xsave_require_permission() below the CPUID helpers so that a future change can reference the cached result of KVM_GET_SUPPORTED_CPUID while keeping the definition of the variable close to its intended user, kvm_get_supported_cpuid(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221128225735.3291648-3-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index aac7b32a794b5..23067465c0355 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -552,38 +552,6 @@ static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) vcpu_sregs_set(vcpu, &sregs); } -void __vm_xsave_require_permission(int bit, const char *name) -{ - int kvm_fd; - u64 bitmask; - long rc; - struct kvm_device_attr attr = { - .group = 0, - .attr = KVM_X86_XCOMP_GUEST_SUPP, - .addr = (unsigned long) &bitmask - }; - - kvm_fd = open_kvm_dev_path_or_exit(); - rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); - close(kvm_fd); - - if (rc == -1 && (errno == ENXIO || errno == EINVAL)) - __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); - - TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); - - __TEST_REQUIRE(bitmask & (1ULL << bit), - "Required XSAVE feature '%s' not supported", name); - - TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit)); - - rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); - TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); - TEST_ASSERT(bitmask & (1ULL << bit), - "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", - bitmask); -} - void kvm_arch_vm_post_create(struct kvm_vm *vm) { vm_create_irqchip(vm); @@ -705,6 +673,38 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) return buffer.entry.data; } +void __vm_xsave_require_permission(int bit, const char *name) +{ + int kvm_fd; + u64 bitmask; + long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); + + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + + __TEST_REQUIRE(bitmask & (1ULL << bit), + "Required XSAVE feature '%s' not supported", name); + + TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit)); + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & (1ULL << bit), + "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", + bitmask); +} + void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) { TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); -- GitLab From cd5f3d210095347e9d40a9a1d464f5ee0bb5d7f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 28 Nov 2022 22:57:34 +0000 Subject: [PATCH 1045/1423] KVM: selftests: Disallow "get supported CPUID" before REQ_XCOMP_GUEST_PERM Disallow using kvm_get_supported_cpuid() and thus caching KVM's supported CPUID info before enabling XSAVE-managed features that are off-by-default and must be enabled by ARCH_REQ_XCOMP_GUEST_PERM. Caching the supported CPUID before all XSAVE features are enabled can result in false negatives due to testing features that were cached before they were enabled. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221128225735.3291648-4-seanjc@google.com --- .../selftests/kvm/lib/x86_64/processor.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 23067465c0355..1d3829e652e65 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -601,21 +601,24 @@ void vcpu_arch_free(struct kvm_vcpu *vcpu) free(vcpu->cpuid); } +/* Do not use kvm_supported_cpuid directly except for validity checks. */ +static void *kvm_supported_cpuid; + const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) { - static struct kvm_cpuid2 *cpuid; int kvm_fd; - if (cpuid) - return cpuid; + if (kvm_supported_cpuid) + return kvm_supported_cpuid; - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); kvm_fd = open_kvm_dev_path_or_exit(); - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, + (struct kvm_cpuid2 *)kvm_supported_cpuid); close(kvm_fd); - return cpuid; + return kvm_supported_cpuid; } static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid, @@ -684,6 +687,9 @@ void __vm_xsave_require_permission(int bit, const char *name) .addr = (unsigned long) &bitmask }; + TEST_ASSERT(!kvm_supported_cpuid, + "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM"); + kvm_fd = open_kvm_dev_path_or_exit(); rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); close(kvm_fd); -- GitLab From 553d1652b8615b5ae3080bb1a561207aee87fa85 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 28 Nov 2022 22:57:35 +0000 Subject: [PATCH 1046/1423] KVM: selftests: Do kvm_cpu_has() checks before creating VM+vCPU Move the AMX test's kvm_cpu_has() checks before creating the VM+vCPU, there are no dependencies between the two operations. Opportunistically add a comment to call out that enabling off-by-default XSAVE-managed features must be done before KVM_GET_SUPPORTED_CPUID is cached. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221128225735.3291648-5-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/amx_test.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 1256c7faadd32..bd72c6eb3b670 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -249,17 +249,21 @@ int main(int argc, char *argv[]) u32 amx_offset; int stage, ret; + /* + * Note, all off-by-default features must be enabled before anything + * caches KVM_GET_SUPPORTED_CPUID, e.g. before using kvm_cpu_has(). + */ vm_xsave_require_permission(XSTATE_XTILE_DATA_BIT); - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE), "KVM should enumerate max XSAVE size when XSAVE is supported"); xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE); -- GitLab From 0c3265235fc17e78773025ed0ddc7c0324b6ed89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Nov 2022 01:33:09 +0000 Subject: [PATCH 1047/1423] KVM: selftests: Define and use a custom static assert in lib headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define and use kvm_static_assert() in the common KVM selftests headers to provide deterministic behavior, and to allow creating static asserts without dummy messages. The kernel's static_assert() makes the message param optional, and on the surface, tools/include/linux/build_bug.h appears to follow suit. However, glibc may override static_assert() and redefine it as a direct alias of _Static_assert(), which makes the message parameter mandatory. This leads to non-deterministic behavior as KVM selftests code that utilizes static_assert() without a custom message may or not compile depending on the order of includes. E.g. recently added asserts in x86_64/processor.h fail on some systems with errors like In file included from lib/memstress.c:11:0: include/x86_64/processor.h: In function ‘this_cpu_has_p’: include/x86_64/processor.h:193:34: error: expected ‘,’ before ‘)’ token static_assert(low_bit < high_bit); \ ^ due to _Static_assert() expecting a comma before a message. The "message optional" version of static_assert() uses macro magic to strip away the comma when presented with empty an __VA_ARGS__ #ifndef static_assert #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) #endif // static_assert and effectively generates "_Static_assert(expr, #expr)". The incompatible version of static_assert() gets defined by this snippet in /usr/include/assert.h: #if defined __USE_ISOC11 && !defined __cplusplus # undef static_assert # define static_assert _Static_assert #endif which yields "_Static_assert(expr)" and thus fails as above. KVM selftests don't actually care about using C11, but __USE_ISOC11 gets defined because of _GNU_SOURCE, which many tests do #define. _GNU_SOURCE triggers a massive pile of defines in /usr/include/features.h, including _ISOC11_SOURCE: /* If _GNU_SOURCE was defined by the user, turn on all the other features. */ #ifdef _GNU_SOURCE # undef _ISOC95_SOURCE # define _ISOC95_SOURCE 1 # undef _ISOC99_SOURCE # define _ISOC99_SOURCE 1 # undef _ISOC11_SOURCE # define _ISOC11_SOURCE 1 # undef _POSIX_SOURCE # define _POSIX_SOURCE 1 # undef _POSIX_C_SOURCE # define _POSIX_C_SOURCE 200809L # undef _XOPEN_SOURCE # define _XOPEN_SOURCE 700 # undef _XOPEN_SOURCE_EXTENDED # define _XOPEN_SOURCE_EXTENDED 1 # undef _LARGEFILE64_SOURCE # define _LARGEFILE64_SOURCE 1 # undef _DEFAULT_SOURCE # define _DEFAULT_SOURCE 1 # undef _ATFILE_SOURCE # define _ATFILE_SOURCE 1 #endif which further down in /usr/include/features.h leads to: /* This is to enable the ISO C11 extension. */ #if (defined _ISOC11_SOURCE \ || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L)) # define __USE_ISOC11 1 #endif To make matters worse, /usr/include/assert.h doesn't guard against multiple inclusion by turning itself into a nop, but instead #undefs a few macros and continues on. As a result, it's all but impossible to ensure the "message optional" version of static_assert() will actually be used, e.g. explicitly including assert.h and #undef'ing static_assert() doesn't work as a later inclusion of assert.h will again redefine its version. #ifdef _ASSERT_H # undef _ASSERT_H # undef assert # undef __ASSERT_VOID_CAST # ifdef __USE_GNU # undef assert_perror # endif #endif /* assert.h */ #define _ASSERT_H 1 #include Fixes: fcba483e8246 ("KVM: selftests: Sanity check input to ioctls() at build time") Fixes: ee3795536664 ("KVM: selftests: Refactor X86_FEATURE_* framework to prep for X86_PROPERTY_*") Fixes: 53a7dc0f215e ("KVM: selftests: Add X86_PROPERTY_* framework to retrieve CPUID values") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221122013309.1872347-1-seanjc@google.com --- .../selftests/kvm/include/kvm_util_base.h | 14 +++++++++++- .../selftests/kvm/include/x86_64/processor.h | 22 +++++++++---------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c7685c7038ff0..9fa0d340f2915 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -22,6 +22,18 @@ #include "sparsebit.h" +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. If _ISOC11_SOURCE is defined, glibc (/usr/include/assert.h) + * #undefs and #defines static_assert() as a direct alias to _Static_assert(), + * i.e. effectively makes the message mandatory. Many KVM selftests #define + * _GNU_SOURCE for various reasons, and _GNU_SOURCE implies _ISOC11_SOURCE. As + * a result, static_assert() behavior is non-deterministic and may or may not + * require a message depending on #include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) + #define KVM_DEV_PATH "/dev/kvm" #define KVM_MAX_VCPUS 512 @@ -196,7 +208,7 @@ static inline bool kvm_has_cap(long cap) #define kvm_do_ioctl(fd, cmd, arg) \ ({ \ - static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd), ""); \ + kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ ioctl(fd, cmd, arg); \ }) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 22852bd32d7b7..411549ef4947f 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -72,11 +72,11 @@ struct kvm_x86_cpu_feature { .bit = __bit, \ }; \ \ - static_assert((fn & 0xc0000000) == 0 || \ - (fn & 0xc0000000) == 0x40000000 || \ - (fn & 0xc0000000) == 0x80000000 || \ - (fn & 0xc0000000) == 0xc0000000); \ - static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ feature; \ }) @@ -191,12 +191,12 @@ struct kvm_x86_cpu_property { .hi_bit = high_bit, \ }; \ \ - static_assert(low_bit < high_bit); \ - static_assert((fn & 0xc0000000) == 0 || \ - (fn & 0xc0000000) == 0x40000000 || \ - (fn & 0xc0000000) == 0x80000000 || \ - (fn & 0xc0000000) == 0xc0000000); \ - static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ + kvm_static_assert(low_bit < high_bit); \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ property; \ }) -- GitLab From efa2afc3969e166702fd2ae3cfb1a7a195ef3533 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:05 +0000 Subject: [PATCH 1048/1423] RDMA: Extend RDMA user ABI to support atomic write 1) Define new atomic write request/completion in userspace. 2) Define new atomic write capability in userspace. Link: https://lore.kernel.org/r/1669905432-14-2-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_verbs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 43672cb1fd579..2378148155442 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_ATOMIC_WRITE = 9, }; struct ib_uverbs_wc { @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1331,6 +1333,8 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. */ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Atomic write attributes */ + IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; enum ib_uverbs_raw_packet_caps { -- GitLab From 3ff81e827b8d5cea36ff374a11c200b4306f45d2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:06 +0000 Subject: [PATCH 1049/1423] RDMA: Extend RDMA kernel ABI to support atomic write 1) Define new atomic write request/completion in kernel. 2) Define new atomic write capability in kernel. 3) Define new atomic write opcode for RC service in packet. Link: https://lore.kernel.org/r/1669905432-14-3-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 2 ++ include/rdma/ib_verbs.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index a9162f25beaf5..f932d164af637 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ @@ -112,6 +113,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ IB_OPCODE(UC, SEND_FIRST), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 77dd9148815b9..df6bb26ba0be6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,7 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { @@ -982,6 +983,7 @@ enum ib_wc_opcode { IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, + IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, @@ -1325,6 +1327,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, -- GitLab From c2d939002934fa9d7b802f196b069963b46da194 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:07 +0000 Subject: [PATCH 1050/1423] RDMA/rxe: Extend rxe user ABI to support atomic write Define an atomic_wr array to store 8-byte value. Link: https://lore.kernel.org/r/1669905432-14-4-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 73f679dfd2dfb..d20d1ecf046fd 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -146,6 +146,7 @@ struct rxe_dma_info { __u32 reserved; union { __DECLARE_FLEX_ARRAY(__u8, inline_data); + __DECLARE_FLEX_ARRAY(__u8, atomic_wr); __DECLARE_FLEX_ARRAY(struct rxe_sge, sge); }; }; -- GitLab From 5c7af6c7938466aa2f3c52057f4dd28b4a1e9e42 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:08 +0000 Subject: [PATCH 1051/1423] RDMA/rxe: Extend rxe packet format to support atomic write Extend rxe_wr_opcode_info[] and rxe_opcode[] for new atomic write opcode. Link: https://lore.kernel.org/r/1669905432-14-5-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.h | 3 +++ 2 files changed, 21 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c index d4ba4d506f176..fb196029048e5 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.c +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -101,6 +101,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_QPT_UC] = WR_LOCAL_OP_MASK, }, }, + [IB_WR_ATOMIC_WRITE] = { + .name = "IB_WR_ATOMIC_WRITE", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_WRITE_MASK, + }, + }, }; struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { @@ -378,6 +384,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { RXE_IETH_BYTES, } }, + [IB_OPCODE_RC_ATOMIC_WRITE] = { + .name = "IB_OPCODE_RC_ATOMIC_WRITE", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + RXE_RETH_BYTES, + } + }, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = { diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index 8f9aaaf260f29..a470e9b0b8842 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -20,6 +20,7 @@ enum rxe_wr_mask { WR_READ_MASK = BIT(3), WR_WRITE_MASK = BIT(4), WR_LOCAL_OP_MASK = BIT(5), + WR_ATOMIC_WRITE_MASK = BIT(7), WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, @@ -81,6 +82,8 @@ enum rxe_hdr_mask { RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + RXE_ATOMIC_WRITE_MASK = BIT(NUM_HDR_TYPES + 14), + RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK), RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK), RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK), -- GitLab From abb633cf28049e6a7c37c44f83a8584f7dbded7d Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:25 +0000 Subject: [PATCH 1052/1423] RDMA/rxe: Make requester support atomic write on RC service Make requester process and send an atomic write request on RC service. Link: https://lore.kernel.org/r/1669905568-62-1-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 4d45f508392fe..2713e90589225 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -258,6 +258,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) else return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_ATOMIC_WRITE: + return IB_OPCODE_RC_ATOMIC_WRITE; + case IB_WR_REG_MR: case IB_WR_LOCAL_INV: return opcode; @@ -486,6 +490,11 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, } } + if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + memcpy(payload_addr(pkt), wqe->dma.atomic_wr, payload); + wqe->dma.resid -= payload; + } + return 0; } @@ -709,13 +718,15 @@ int rxe_requester(void *arg) } mask = rxe_opcode[opcode].mask; - if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) { + if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | + RXE_ATOMIC_WRITE_MASK))) { if (check_init_depth(qp, wqe)) goto exit; } mtu = get_mtu(qp); - payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0; + payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? + wqe->dma.resid : 0; if (payload > mtu) { if (qp_type(qp) == IB_QPT_UD) { /* C10-93.1.1: If the total sum of all the buffer lengths specified for a -- GitLab From 034e285f8b99062a0cf29112e1232154a6a44aa5 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:26 +0000 Subject: [PATCH 1053/1423] RDMA/rxe: Make responder support atomic write on RC service Make responder process an atomic write request and send a read response on RC service. Link: https://lore.kernel.org/r/1669905568-62-2-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 84 ++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 6761bcd1d4d8f..6ac544477f3f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -22,6 +22,7 @@ enum resp_states { RESPST_EXECUTE, RESPST_READ_REPLY, RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -57,6 +58,7 @@ static char *resp_state_name[] = { [RESPST_EXECUTE] = "EXECUTE", [RESPST_READ_REPLY] = "READ_REPLY", [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", + [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -263,7 +265,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp, case IB_QPT_RC: if (((pkt->mask & RXE_READ_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || - ((pkt->mask & RXE_WRITE_MASK) && + ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || ((pkt->mask & RXE_ATOMIC_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { @@ -367,7 +369,7 @@ static enum resp_states check_resource(struct rxe_qp *qp, } } - if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) { + if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) { /* it is the requesters job to not send * too many read/atomic ops, we just * recycle the responder resource queue @@ -438,7 +440,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, enum resp_states state; int access; - if (pkt->mask & RXE_READ_OR_WRITE_MASK) { + if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (pkt->mask & RXE_RETH_MASK) { qp->resp.va = reth_va(pkt); qp->resp.offset = 0; @@ -504,7 +506,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, goto err; } - if (pkt->mask & RXE_WRITE_MASK) { + if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; @@ -604,6 +606,7 @@ static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, res->state = rdatm_res_state_new; break; case RXE_ATOMIC_MASK: + case RXE_ATOMIC_WRITE_MASK: res->first_psn = pkt->psn; res->last_psn = pkt->psn; res->cur_psn = pkt->psn; @@ -673,6 +676,55 @@ out: return ret; } +static enum resp_states atomic_write_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 src, *dst; + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr = qp->resp.mr; + int payload = payload_size(pkt); + + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); + qp->resp.res = res; + } + + if (!res->replay) { +#ifdef CONFIG_64BIT + if (mr->state != RXE_MR_STATE_VALID) + return RESPST_ERR_RKEY_VIOLATION; + + memcpy(&src, payload_addr(pkt), payload); + + dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload); + /* check vaddr is 8 bytes aligned. */ + if (!dst || (uintptr_t)dst & 7) + return RESPST_ERR_MISALIGNED_ATOMIC; + + /* Do atomic write after all prior operations have completed */ + smp_store_release(dst, src); + + /* decrease resp.resid to zero */ + qp->resp.resid -= sizeof(payload); + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +#else + return RESPST_ERR_UNSUPPORTED_OPCODE; +#endif /* CONFIG_64BIT */ + } + + return RESPST_ACKNOWLEDGE; +} + static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, struct rxe_pkt_info *ack, int opcode, @@ -912,6 +964,8 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) return RESPST_READ_REPLY; } else if (pkt->mask & RXE_ATOMIC_MASK) { return RESPST_ATOMIC_REPLY; + } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + return RESPST_ATOMIC_WRITE_REPLY; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -1085,6 +1139,19 @@ static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) return ret; } +static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) +{ + int ret = send_common_ack(qp, syndrome, psn, + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY, + "RDMA READ response of length zero ACK"); + + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; + return ret; +} + static enum resp_states acknowledge(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -1095,6 +1162,8 @@ static enum resp_states acknowledge(struct rxe_qp *qp, send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) + send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); @@ -1206,7 +1275,9 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, res->replay = 1; res->cur_psn = pkt->psn; qp->resp.res = res; - rc = RESPST_ATOMIC_REPLY; + rc = pkt->mask & RXE_ATOMIC_MASK ? + RESPST_ATOMIC_REPLY : + RESPST_ATOMIC_WRITE_REPLY; goto out; } @@ -1343,6 +1414,9 @@ int rxe_responder(void *arg) case RESPST_ATOMIC_REPLY: state = atomic_reply(qp, pkt); break; + case RESPST_ATOMIC_WRITE_REPLY: + state = atomic_write_reply(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; -- GitLab From 3aec427bb1499bd3325d2e251edb729a5a8643df Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:27 +0000 Subject: [PATCH 1054/1423] RDMA/rxe: Implement atomic write completion Generate an atomic write completion when the atomic write request has been finished. Link: https://lore.kernel.org/r/1669905568-62-3-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 4dca4f8bbb5aa..1c525325e2716 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; case IB_WR_REG_MR: return IB_WC_REG_MR; case IB_WR_BIND_MW: return IB_WC_BIND_MW; + case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE; default: return 0xff; @@ -269,6 +270,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, if ((syn & AETH_TYPE_MASK) != AETH_ACK) return COMPST_ERROR; + if (wqe->wr.opcode == IB_WR_ATOMIC_WRITE) + return COMPST_WRITE_SEND; + fallthrough; /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH) */ -- GitLab From 4cd9f1d320f905e7bc60f030566d15003745ba91 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:28 +0000 Subject: [PATCH 1055/1423] RDMA/rxe: Enable atomic write capability for rxe device The capability shows that rxe device supports atomic write operation. Link: https://lore.kernel.org/r/1669905568-62-4-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 86c7a8bf3cbbd..bbc88cd71d950 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -51,7 +51,12 @@ enum rxe_device_param { | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_MEM_WINDOW +#ifdef CONFIG_64BIT + | IB_DEVICE_MEM_WINDOW_TYPE_2B + | IB_DEVICE_ATOMIC_WRITE, +#else | IB_DEVICE_MEM_WINDOW_TYPE_2B, +#endif /* CONFIG_64BIT */ RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, -- GitLab From 31f81401e23fb88cc030cd586abd28740e6c8136 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 21 Nov 2022 19:27:34 +0800 Subject: [PATCH 1056/1423] crypto: qat - fix error return code in adf_probe Fix to return a negative error code -EINVAL instead of 0. Fixes: 0cec19c761e5 ("crypto: qat - add support for compression for 4xxx") Signed-off-by: Wang Yufen Acked-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_4xxx/adf_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c b/drivers/crypto/qat/qat_4xxx/adf_drv.c index 2f212561acc47..670a58b25cb16 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c @@ -261,6 +261,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw_data->accel_capabilities_mask = hw_data->get_accel_cap(accel_dev); if (!hw_data->accel_capabilities_mask) { dev_err(&pdev->dev, "Failed to get capabilities mask.\n"); + ret = -EINVAL; goto out_err; } -- GitLab From 6a83830f649a614aca445bbcadbd582c7929e63d Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Mon, 21 Nov 2022 15:12:41 +0100 Subject: [PATCH 1057/1423] crypto: caam - warn if blob_gen key is insecure If CAAM is not in "trusted" or "secure" state, a fixed non-volatile key is used instead of the unique device key. This is the default mode of operation without secure boot (HAB). In this scenario, CAAM encrypted blobs should be used only for testing but not in a production environment, so issue a warning. Signed-off-by: Nikolaus Voss Reviewed-by: Ahmad Fatoum Signed-off-by: Herbert Xu --- drivers/crypto/caam/blob_gen.c | 9 +++++++++ drivers/crypto/caam/regs.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/crypto/caam/blob_gen.c b/drivers/crypto/caam/blob_gen.c index 6345c7269eb03..1f65df4898478 100644 --- a/drivers/crypto/caam/blob_gen.c +++ b/drivers/crypto/caam/blob_gen.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) "caam blob_gen: " fmt +#include #include #include @@ -61,12 +62,14 @@ static void caam_blob_job_done(struct device *dev, u32 *desc, u32 err, void *con int caam_process_blob(struct caam_blob_priv *priv, struct caam_blob_info *info, bool encap) { + const struct caam_drv_private *ctrlpriv; struct caam_blob_job_result testres; struct device *jrdev = &priv->jrdev; dma_addr_t dma_in, dma_out; int op = OP_PCLID_BLOB; size_t output_len; u32 *desc; + u32 moo; int ret; if (info->key_mod_len > CAAM_BLOB_KEYMOD_LENGTH) @@ -100,6 +103,12 @@ int caam_process_blob(struct caam_blob_priv *priv, goto out_unmap_in; } + ctrlpriv = dev_get_drvdata(jrdev->parent); + moo = FIELD_GET(CSTA_MOO, ioread32(&ctrlpriv->ctrl->perfmon.status)); + if (moo != CSTA_MOO_SECURE && moo != CSTA_MOO_TRUSTED) + dev_warn(jrdev, + "using insecure test key, enable HAB to use unique device key!\n"); + /* * A data blob is encrypted using a blob key (BK); a random number. * The BK is used as an AES-CCM key. The initial block (B0) and the diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index 66d6dad841bb2..66928f8a0c4b1 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -426,6 +426,9 @@ struct caam_perfmon { u32 rsvd2; #define CSTA_PLEND BIT(10) #define CSTA_ALT_PLEND BIT(18) +#define CSTA_MOO GENMASK(9, 8) +#define CSTA_MOO_SECURE 1 +#define CSTA_MOO_TRUSTED 2 u32 status; /* CSTA - CAAM Status */ u64 rsvd3; -- GitLab From 5b11d1a360ea23c80c6d4ec3f5986a788d0a0995 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 13:53:38 +0800 Subject: [PATCH 1058/1423] crypto: rsa-pkcs1pad - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu --- crypto/rsa-pkcs1pad.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c index 3285e3af43e14..3bc76edb3f8a9 100644 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@ -579,6 +579,10 @@ static int pkcs1pad_init_tfm(struct crypto_akcipher *tfm) return PTR_ERR(child_tfm); ctx->child = child_tfm; + + akcipher_set_reqsize(tfm, sizeof(struct pkcs1pad_request) + + crypto_akcipher_reqsize(child_tfm)); + return 0; } @@ -674,7 +678,6 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb) inst->alg.set_pub_key = pkcs1pad_set_pub_key; inst->alg.set_priv_key = pkcs1pad_set_priv_key; inst->alg.max_size = pkcs1pad_get_max_size; - inst->alg.reqsize = sizeof(struct pkcs1pad_request) + rsa_alg->reqsize; inst->free = pkcs1pad_free; -- GitLab From bd71e0dced921e00599052b445f9a9f7916a5452 Mon Sep 17 00:00:00 2001 From: Yushan Zhou Date: Tue, 22 Nov 2022 15:49:00 +0800 Subject: [PATCH 1059/1423] crypto: marvell/octeontx - remove redundant NULL check release_firmware() checks whether firmware pointer is NULL. Remove the redundant NULL check in release_tar_archive(). Signed-off-by: Yushan Zhou Signed-off-by: Herbert Xu --- drivers/crypto/marvell/octeontx/otx_cptpf_ucode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/marvell/octeontx/otx_cptpf_ucode.c b/drivers/crypto/marvell/octeontx/otx_cptpf_ucode.c index df9c2b8747e6d..c4250e5fcf8f7 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptpf_ucode.c +++ b/drivers/crypto/marvell/octeontx/otx_cptpf_ucode.c @@ -345,8 +345,7 @@ static void release_tar_archive(struct tar_arch_info_t *tar_arch) kfree(curr); } - if (tar_arch->fw) - release_firmware(tar_arch->fw); + release_firmware(tar_arch->fw); kfree(tar_arch); } -- GitLab From 56861cbde1b9f3b34d300e6ba87f2c3de1a9c309 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 17:24:01 +0800 Subject: [PATCH 1060/1423] crypto: kpp - Add helper to set reqsize The value of reqsize should only be changed through a helper. To do so we need to first add a helper for this. Signed-off-by: Herbert Xu --- include/crypto/internal/kpp.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/crypto/internal/kpp.h b/include/crypto/internal/kpp.h index 9cb0662ebe871..31ff3c1986ef0 100644 --- a/include/crypto/internal/kpp.h +++ b/include/crypto/internal/kpp.h @@ -50,6 +50,12 @@ static inline void *kpp_request_ctx(struct kpp_request *req) return req->__ctx; } +static inline void kpp_set_reqsize(struct crypto_kpp *kpp, + unsigned int reqsize) +{ + crypto_kpp_alg(kpp)->reqsize = reqsize; +} + static inline void *kpp_tfm_ctx(struct crypto_kpp *tfm) { return tfm->base.__crt_ctx; -- GitLab From 5ba78373561f12d23c975b0a154104a07866f94b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 17:28:38 +0800 Subject: [PATCH 1061/1423] crypto: hisilicon/hpre - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu Reviewed-by: Longfang Liu Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_crypto.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index ef02dadd62170..5f6d363c9435e 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -740,6 +740,8 @@ static int hpre_dh_init_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + return hpre_ctx_init(ctx, HPRE_V2_ALG_TYPE); } @@ -1165,6 +1167,9 @@ static int hpre_rsa_init_tfm(struct crypto_akcipher *tfm) return PTR_ERR(ctx->rsa.soft_tfm); } + akcipher_set_reqsize(tfm, sizeof(struct hpre_asym_request) + + HPRE_ALIGN_SZ); + ret = hpre_ctx_init(ctx, HPRE_V2_ALG_TYPE); if (ret) crypto_free_akcipher(ctx->rsa.soft_tfm); @@ -1617,6 +1622,8 @@ static int hpre_ecdh_nist_p192_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P192; + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1626,6 +1633,8 @@ static int hpre_ecdh_nist_p256_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P256; + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1635,6 +1644,8 @@ static int hpre_ecdh_nist_p384_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P384; + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1961,6 +1972,8 @@ static int hpre_curve25519_init_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1981,7 +1994,6 @@ static struct akcipher_alg rsa = { .max_size = hpre_rsa_max_size, .init = hpre_rsa_init_tfm, .exit = hpre_rsa_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, @@ -1998,7 +2010,6 @@ static struct kpp_alg dh = { .max_size = hpre_dh_max_size, .init = hpre_dh_init_tfm, .exit = hpre_dh_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, @@ -2016,7 +2027,6 @@ static struct kpp_alg ecdh_curves[] = { .max_size = hpre_ecdh_max_size, .init = hpre_ecdh_nist_p192_init_tfm, .exit = hpre_ecdh_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, @@ -2031,7 +2041,6 @@ static struct kpp_alg ecdh_curves[] = { .max_size = hpre_ecdh_max_size, .init = hpre_ecdh_nist_p256_init_tfm, .exit = hpre_ecdh_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, @@ -2046,7 +2055,6 @@ static struct kpp_alg ecdh_curves[] = { .max_size = hpre_ecdh_max_size, .init = hpre_ecdh_nist_p384_init_tfm, .exit = hpre_ecdh_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, @@ -2064,7 +2072,6 @@ static struct kpp_alg curve25519_alg = { .max_size = hpre_curve25519_max_size, .init = hpre_curve25519_init_tfm, .exit = hpre_curve25519_exit_tfm, - .reqsize = sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ, .base = { .cra_ctxsize = sizeof(struct hpre_ctx), .cra_priority = HPRE_CRYPTO_ALG_PRI, -- GitLab From 80e62ad58db084920d8cf23323b713391e09f374 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 17:30:58 +0800 Subject: [PATCH 1062/1423] crypto: qat - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_asym_algs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c b/drivers/crypto/qat/qat_common/qat_asym_algs.c index 94a26702aeae1..935a7e012946e 100644 --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c @@ -494,6 +494,8 @@ static int qat_dh_init_tfm(struct crypto_kpp *tfm) if (!inst) return -EINVAL; + kpp_set_reqsize(tfm, sizeof(struct qat_asym_request) + 64); + ctx->p_size = 0; ctx->g2 = false; ctx->inst = inst; @@ -1230,6 +1232,8 @@ static int qat_rsa_init_tfm(struct crypto_akcipher *tfm) if (!inst) return -EINVAL; + akcipher_set_reqsize(tfm, sizeof(struct qat_asym_request) + 64); + ctx->key_sz = 0; ctx->inst = inst; return 0; @@ -1252,7 +1256,6 @@ static struct akcipher_alg rsa = { .max_size = qat_rsa_max_size, .init = qat_rsa_init_tfm, .exit = qat_rsa_exit_tfm, - .reqsize = sizeof(struct qat_asym_request) + 64, .base = { .cra_name = "rsa", .cra_driver_name = "qat-rsa", @@ -1269,7 +1272,6 @@ static struct kpp_alg dh = { .max_size = qat_dh_max_size, .init = qat_dh_init_tfm, .exit = qat_dh_exit_tfm, - .reqsize = sizeof(struct qat_asym_request) + 64, .base = { .cra_name = "dh", .cra_driver_name = "qat-dh", -- GitLab From 908d383b6c94be0f89c5e2a5a346d99495efd4d4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 17:40:51 +0800 Subject: [PATCH 1063/1423] crypto: caam - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu Reviewed-by: Gaurav Jain Signed-off-by: Herbert Xu --- drivers/crypto/caam/caampkc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index 8867275767101..642846693d7c0 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -1099,6 +1099,8 @@ static int caam_rsa_init_tfm(struct crypto_akcipher *tfm) { struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + akcipher_set_reqsize(tfm, sizeof(struct caam_rsa_req_ctx)); + ctx->dev = caam_jr_alloc(); if (IS_ERR(ctx->dev)) { @@ -1141,7 +1143,6 @@ static struct caam_akcipher_alg caam_rsa = { .max_size = caam_rsa_max_size, .init = caam_rsa_init_tfm, .exit = caam_rsa_exit_tfm, - .reqsize = sizeof(struct caam_rsa_req_ctx), .base = { .cra_name = "rsa", .cra_driver_name = "rsa-caam", -- GitLab From 93c446cd36a410b31519af7a2dd32e899cc03d06 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 17:42:26 +0800 Subject: [PATCH 1064/1423] crypto: virtio - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu Acked-by: Gonglei Signed-off-by: Herbert Xu --- drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c index 168195672e2e1..b2979be613b8f 100644 --- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c @@ -479,6 +479,9 @@ static int virtio_crypto_rsa_init_tfm(struct crypto_akcipher *tfm) ctx->enginectx.op.prepare_request = NULL; ctx->enginectx.op.unprepare_request = NULL; + akcipher_set_reqsize(tfm, + sizeof(struct virtio_crypto_akcipher_request)); + return 0; } @@ -505,7 +508,6 @@ static struct virtio_crypto_akcipher_algo virtio_crypto_akcipher_algs[] = { .max_size = virtio_crypto_rsa_max_size, .init = virtio_crypto_rsa_init_tfm, .exit = virtio_crypto_rsa_exit_tfm, - .reqsize = sizeof(struct virtio_crypto_akcipher_request), .base = { .cra_name = "rsa", .cra_driver_name = "virtio-crypto-rsa", @@ -528,7 +530,6 @@ static struct virtio_crypto_akcipher_algo virtio_crypto_akcipher_algs[] = { .max_size = virtio_crypto_rsa_max_size, .init = virtio_crypto_rsa_init_tfm, .exit = virtio_crypto_rsa_exit_tfm, - .reqsize = sizeof(struct virtio_crypto_akcipher_request), .base = { .cra_name = "pkcs1pad(rsa,sha1)", .cra_driver_name = "virtio-pkcs1-rsa-with-sha1", -- GitLab From 3e71e5b0efcc730216f4450b796df4fdd627ecd0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 18:03:35 +0800 Subject: [PATCH 1065/1423] crypto: akcipher - Move reqsize into tfm The value of reqsize cannot be determined in case of fallbacks. Therefore it must be stored in the tfm and not the alg object. Signed-off-by: Herbert Xu --- include/crypto/akcipher.h | 7 ++++--- include/crypto/internal/akcipher.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h index 5764b46bd1ec1..734c213918bda 100644 --- a/include/crypto/akcipher.h +++ b/include/crypto/akcipher.h @@ -43,9 +43,12 @@ struct akcipher_request { * struct crypto_akcipher - user-instantiated objects which encapsulate * algorithms and core processing logic * + * @reqsize: Request context size required by algorithm implementation * @base: Common crypto API algorithm data structure */ struct crypto_akcipher { + unsigned int reqsize; + struct crypto_tfm base; }; @@ -86,7 +89,6 @@ struct crypto_akcipher { * counterpart to @init, used to remove various changes set in * @init. * - * @reqsize: Request context size required by algorithm implementation * @base: Common crypto API algorithm data structure */ struct akcipher_alg { @@ -102,7 +104,6 @@ struct akcipher_alg { int (*init)(struct crypto_akcipher *tfm); void (*exit)(struct crypto_akcipher *tfm); - unsigned int reqsize; struct crypto_alg base; }; @@ -155,7 +156,7 @@ static inline struct akcipher_alg *crypto_akcipher_alg( static inline unsigned int crypto_akcipher_reqsize(struct crypto_akcipher *tfm) { - return crypto_akcipher_alg(tfm)->reqsize; + return tfm->reqsize; } static inline void akcipher_request_set_tfm(struct akcipher_request *req, diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h index 8d3220c9ab772..1474a2d890fc4 100644 --- a/include/crypto/internal/akcipher.h +++ b/include/crypto/internal/akcipher.h @@ -36,7 +36,7 @@ static inline void *akcipher_request_ctx(struct akcipher_request *req) static inline void akcipher_set_reqsize(struct crypto_akcipher *akcipher, unsigned int reqsize) { - crypto_akcipher_alg(akcipher)->reqsize = reqsize; + akcipher->reqsize = reqsize; } static inline void *akcipher_tfm_ctx(struct crypto_akcipher *tfm) -- GitLab From cb99fc0dd1f6985e8b6ade93e3d69f5e33930539 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 18:06:56 +0800 Subject: [PATCH 1066/1423] crypto: dh - Use helper to set reqsize The value of reqsize must only be changed through the helper. Signed-off-by: Herbert Xu --- crypto/dh.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/dh.c b/crypto/dh.c index 99c3b2ef7adca..e39c1bde1ac07 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -318,6 +318,9 @@ static int dh_safe_prime_init_tfm(struct crypto_kpp *tfm) if (IS_ERR(tfm_ctx->dh_tfm)) return PTR_ERR(tfm_ctx->dh_tfm); + kpp_set_reqsize(tfm, sizeof(struct kpp_request) + + crypto_kpp_reqsize(tfm_ctx->dh_tfm)); + return 0; } @@ -593,7 +596,6 @@ static int __maybe_unused __dh_safe_prime_create( inst->alg.max_size = dh_safe_prime_max_size; inst->alg.init = dh_safe_prime_init_tfm; inst->alg.exit = dh_safe_prime_exit_tfm; - inst->alg.reqsize = sizeof(struct kpp_request) + dh_alg->reqsize; inst->alg.base.cra_priority = dh_alg->base.cra_priority; inst->alg.base.cra_module = THIS_MODULE; inst->alg.base.cra_ctxsize = sizeof(struct dh_safe_prime_tfm_ctx); -- GitLab From 4d2b225a67e6df962bbeaad473bfd8f97cfbf478 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2022 18:09:16 +0800 Subject: [PATCH 1067/1423] crypto: kpp - Move reqsize into tfm The value of reqsize cannot be determined in case of fallbacks. Therefore it must be stored in the tfm and not the alg object. Signed-off-by: Herbert Xu --- include/crypto/internal/kpp.h | 2 +- include/crypto/kpp.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/crypto/internal/kpp.h b/include/crypto/internal/kpp.h index 31ff3c1986ef0..167662407e36b 100644 --- a/include/crypto/internal/kpp.h +++ b/include/crypto/internal/kpp.h @@ -53,7 +53,7 @@ static inline void *kpp_request_ctx(struct kpp_request *req) static inline void kpp_set_reqsize(struct crypto_kpp *kpp, unsigned int reqsize) { - crypto_kpp_alg(kpp)->reqsize = reqsize; + kpp->reqsize = reqsize; } static inline void *kpp_tfm_ctx(struct crypto_kpp *tfm) diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index 24d01e9877c12..33ff32878802a 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -37,9 +37,13 @@ struct kpp_request { * struct crypto_kpp - user-instantiated object which encapsulate * algorithms and core processing logic * + * @reqsize: Request context size required by algorithm + * implementation * @base: Common crypto API algorithm data structure */ struct crypto_kpp { + unsigned int reqsize; + struct crypto_tfm base; }; @@ -64,8 +68,6 @@ struct crypto_kpp { * put in place here. * @exit: Undo everything @init did. * - * @reqsize: Request context size required by algorithm - * implementation * @base: Common crypto API algorithm data structure */ struct kpp_alg { @@ -79,7 +81,6 @@ struct kpp_alg { int (*init)(struct crypto_kpp *tfm); void (*exit)(struct crypto_kpp *tfm); - unsigned int reqsize; struct crypto_alg base; }; @@ -128,7 +129,7 @@ static inline struct kpp_alg *crypto_kpp_alg(struct crypto_kpp *tfm) static inline unsigned int crypto_kpp_reqsize(struct crypto_kpp *tfm) { - return crypto_kpp_alg(tfm)->reqsize; + return tfm->reqsize; } static inline void kpp_request_set_tfm(struct kpp_request *req, -- GitLab From 3d780c8a9850ad60dee47a8d971ba7888f3d1bd3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 22 Nov 2022 22:56:19 +0100 Subject: [PATCH 1068/1423] crypto: amlogic - Remove kcalloc without check There is no real point in allocating dedicated memory for the irqs array. MAXFLOW is only 2, so it is easier to allocated the needed space directly within the 'meson_dev' structure. This saves some memory allocation and avoids an indirection when using the irqs array. Fixes: 48fe583fe541 ("crypto: amlogic - Add crypto accelerator...") Signed-off-by: Christophe JAILLET Signed-off-by: Herbert Xu --- drivers/crypto/amlogic/amlogic-gxl-core.c | 1 - drivers/crypto/amlogic/amlogic-gxl.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/amlogic/amlogic-gxl-core.c b/drivers/crypto/amlogic/amlogic-gxl-core.c index 6e7ae896717cd..937187027ad57 100644 --- a/drivers/crypto/amlogic/amlogic-gxl-core.c +++ b/drivers/crypto/amlogic/amlogic-gxl-core.c @@ -237,7 +237,6 @@ static int meson_crypto_probe(struct platform_device *pdev) return err; } - mc->irqs = devm_kcalloc(mc->dev, MAXFLOW, sizeof(int), GFP_KERNEL); for (i = 0; i < MAXFLOW; i++) { mc->irqs[i] = platform_get_irq(pdev, i); if (mc->irqs[i] < 0) diff --git a/drivers/crypto/amlogic/amlogic-gxl.h b/drivers/crypto/amlogic/amlogic-gxl.h index dc0f142324a3c..8c0746a1d6d43 100644 --- a/drivers/crypto/amlogic/amlogic-gxl.h +++ b/drivers/crypto/amlogic/amlogic-gxl.h @@ -95,7 +95,7 @@ struct meson_dev { struct device *dev; struct meson_flow *chanlist; atomic_t flow; - int *irqs; + int irqs[MAXFLOW]; #ifdef CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG struct dentry *dbgfs_dir; #endif -- GitLab From c390c452ebeb44cb979b7374d3acc3859415e86c Mon Sep 17 00:00:00 2001 From: Joe Fradley Date: Tue, 22 Nov 2022 14:54:49 -0800 Subject: [PATCH 1069/1423] crypto: x86/curve25519 - disable gcov curve25519-x86_64.c fails to build when CONFIG_GCOV_KERNEL is enabled. The error is "inline assembly requires more registers than available" thrown from the `fsqr()` function. Therefore, excluding this file from GCOV profiling until this issue is resolved. Thereby allowing CONFIG_GCOV_PROFILE_ALL to be enabled for x86. Signed-off-by: Joe Fradley Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3b1d701a4f6c5..3e7a329235bdb 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -107,3 +107,6 @@ quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ $(obj)/%.S: $(src)/%.pl FORCE $(call if_changed,perlasm) + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_curve25519-x86_64.o := n -- GitLab From 7bcceb4c9896b1b672b636ae70fe75110d6bf1ad Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 24 Nov 2022 14:49:40 +0800 Subject: [PATCH 1070/1423] crypto: omap-sham - Use pm_runtime_resume_and_get() in omap_sham_probe() omap_sham_probe() calls pm_runtime_get_sync() and calls pm_runtime_put_sync() latter to put usage_counter. However, pm_runtime_get_sync() will increment usage_counter even it failed. Fix it by replacing it with pm_runtime_resume_and_get() to keep usage counter balanced. Fixes: b359f034c8bf ("crypto: omap-sham - Convert to use pm_runtime API") Signed-off-by: Shang XiaoJing Acked-by: Mark Greer Signed-off-by: Herbert Xu --- drivers/crypto/omap-sham.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index 655a7f5a406a1..cbeda59c6b191 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -2114,7 +2114,7 @@ static int omap_sham_probe(struct platform_device *pdev) pm_runtime_enable(dev); - err = pm_runtime_get_sync(dev); + err = pm_runtime_resume_and_get(dev); if (err < 0) { dev_err(dev, "failed to get sync: %d\n", err); goto err_pm; -- GitLab From 14386d471322a204344ae81a28738b71e261d3a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:28 +0800 Subject: [PATCH 1071/1423] crypto: Prepare to move crypto_tfm_ctx The helper crypto_tfm_ctx is only used by the Crypto API algorithm code and should really be in algapi.h. However, for historical reasons many files relied on it to be in crypto.h. This patch changes those files to use algapi.h instead in prepartion for a move. Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-cipher-glue.c | 2 +- arch/arm64/crypto/aes-ce-glue.c | 2 +- arch/arm64/crypto/aes-cipher-glue.c | 2 +- arch/arm64/crypto/sm4-ce-cipher-glue.c | 2 +- arch/x86/crypto/twofish_glue.c | 2 +- crypto/aes_generic.c | 2 +- crypto/aes_ti.c | 2 +- crypto/anubis.c | 2 +- crypto/blowfish_common.c | 3 ++- crypto/blowfish_generic.c | 3 ++- crypto/camellia_generic.c | 2 +- crypto/cast5_generic.c | 2 +- crypto/cast6_generic.c | 2 +- crypto/des_generic.c | 2 +- crypto/fcrypt.c | 2 +- crypto/khazad.c | 2 +- crypto/seed.c | 2 +- crypto/serpent_generic.c | 2 +- crypto/sm4_generic.c | 2 +- crypto/tea.c | 2 +- crypto/twofish_common.c | 2 +- crypto/twofish_generic.c | 2 +- drivers/crypto/nx/nx-842.h | 2 +- include/crypto/aria.h | 2 +- include/crypto/internal/acompress.h | 2 ++ include/crypto/internal/scompress.h | 3 ++- 26 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c index 8cd00f56800e7..6dfaef2d8f913 100644 --- a/arch/arm/crypto/aes-cipher-glue.c +++ b/arch/arm/crypto/aes-cipher-glue.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out); diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index 56a5f6f0b0c12..e921823ca103a 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -9,9 +9,9 @@ #include #include #include +#include #include #include -#include #include #include "aes-ce-setkey.h" diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c index 8caf6dfefce88..4ec55e568941c 100644 --- a/arch/arm64/crypto/aes-cipher-glue.c +++ b/arch/arm64/crypto/aes-cipher-glue.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c index 76a34ef4abbbf..c31d76fb5a177 100644 --- a/arch/arm64/crypto/sm4-ce-cipher-glue.c +++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c @@ -2,11 +2,11 @@ #include #include +#include #include #include #include #include -#include #include MODULE_ALIAS_CRYPTO("sm4"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index f9c4adc274040..0614beece2793 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -38,8 +38,8 @@ * Third Edition. */ +#include #include -#include #include #include #include diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c index 27ab279318137..666474b81c6aa 100644 --- a/crypto/aes_generic.c +++ b/crypto/aes_generic.c @@ -48,11 +48,11 @@ */ #include +#include #include #include #include #include -#include #include #include diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c index 205c2c257d492..a3b342f92fab6 100644 --- a/crypto/aes_ti.c +++ b/crypto/aes_ti.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include static int aesti_set_key(struct crypto_tfm *tfm, const u8 *in_key, diff --git a/crypto/anubis.c b/crypto/anubis.c index 5da0241ef453c..9f0cf61bbc6e2 100644 --- a/crypto/anubis.c +++ b/crypto/anubis.c @@ -29,11 +29,11 @@ * */ +#include #include #include #include #include -#include #include #define ANUBIS_MIN_KEY_SIZE 16 diff --git a/crypto/blowfish_common.c b/crypto/blowfish_common.c index 1c072012baff4..c0208ce269a33 100644 --- a/crypto/blowfish_common.c +++ b/crypto/blowfish_common.c @@ -14,11 +14,12 @@ * Copyright (c) Kyle McMartin * Copyright (c) 2002 James Morris */ + +#include #include #include #include #include -#include #include #include diff --git a/crypto/blowfish_generic.c b/crypto/blowfish_generic.c index 003b52c6880ea..0e74c7242e77e 100644 --- a/crypto/blowfish_generic.c +++ b/crypto/blowfish_generic.c @@ -11,11 +11,12 @@ * Copyright (c) Kyle McMartin * Copyright (c) 2002 James Morris */ + +#include #include #include #include #include -#include #include #include diff --git a/crypto/camellia_generic.c b/crypto/camellia_generic.c index fd1a88af9e77f..c04670cf51acf 100644 --- a/crypto/camellia_generic.c +++ b/crypto/camellia_generic.c @@ -9,7 +9,7 @@ * https://info.isl.ntt.co.jp/crypt/eng/camellia/specifications.html */ -#include +#include #include #include #include diff --git a/crypto/cast5_generic.c b/crypto/cast5_generic.c index 0257c14cefc25..085a1eedae03b 100644 --- a/crypto/cast5_generic.c +++ b/crypto/cast5_generic.c @@ -14,8 +14,8 @@ #include +#include #include -#include #include #include #include diff --git a/crypto/cast6_generic.c b/crypto/cast6_generic.c index 75346380aa0bc..34f1ab53e3a71 100644 --- a/crypto/cast6_generic.c +++ b/crypto/cast6_generic.c @@ -11,8 +11,8 @@ #include +#include #include -#include #include #include #include diff --git a/crypto/des_generic.c b/crypto/des_generic.c index c85354a5e94c7..1274e18d3eb90 100644 --- a/crypto/des_generic.c +++ b/crypto/des_generic.c @@ -8,11 +8,11 @@ */ #include +#include #include #include #include #include -#include #include diff --git a/crypto/fcrypt.c b/crypto/fcrypt.c index 76a04d000c0d3..95a16e88899ba 100644 --- a/crypto/fcrypt.c +++ b/crypto/fcrypt.c @@ -43,10 +43,10 @@ */ #include +#include #include #include #include -#include #define ROUNDS 16 diff --git a/crypto/khazad.c b/crypto/khazad.c index f19339954c89e..70cafe73f9740 100644 --- a/crypto/khazad.c +++ b/crypto/khazad.c @@ -19,11 +19,11 @@ * */ +#include #include #include #include #include -#include #include #define KHAZAD_KEY_SIZE 16 diff --git a/crypto/seed.c b/crypto/seed.c index 27720140820ef..d0506ade2a5f8 100644 --- a/crypto/seed.c +++ b/crypto/seed.c @@ -8,11 +8,11 @@ * Copyright (C) 2007 Korea Information Security Agency (KISA). */ +#include #include #include #include #include -#include #include #define SEED_NUM_KCONSTANTS 16 diff --git a/crypto/serpent_generic.c b/crypto/serpent_generic.c index 45f98b750053e..c6bca47931e21 100644 --- a/crypto/serpent_generic.c +++ b/crypto/serpent_generic.c @@ -7,11 +7,11 @@ * Copyright (C) 2002 Dag Arne Osvik */ +#include #include #include #include #include -#include #include #include diff --git a/crypto/sm4_generic.c b/crypto/sm4_generic.c index 4a6480a27fee5..560eba37dc55e 100644 --- a/crypto/sm4_generic.c +++ b/crypto/sm4_generic.c @@ -7,12 +7,12 @@ * All rights reserved. */ +#include #include #include #include #include #include -#include #include #include diff --git a/crypto/tea.c b/crypto/tea.c index 02efc5d816903..896f863f3067c 100644 --- a/crypto/tea.c +++ b/crypto/tea.c @@ -14,11 +14,11 @@ * Copyright (c) 2004 Aaron Grothe ajgrothe@yahoo.com */ +#include #include #include #include #include -#include #include #define TEA_KEY_SIZE 16 diff --git a/crypto/twofish_common.c b/crypto/twofish_common.c index f921f30334f48..bf4f28742f770 100644 --- a/crypto/twofish_common.c +++ b/crypto/twofish_common.c @@ -25,9 +25,9 @@ * Third Edition. */ +#include #include #include -#include #include #include #include diff --git a/crypto/twofish_generic.c b/crypto/twofish_generic.c index 86b2f067a4162..557915e4062d7 100644 --- a/crypto/twofish_generic.c +++ b/crypto/twofish_generic.c @@ -25,12 +25,12 @@ */ #include +#include #include #include #include #include #include -#include #include /* Macros to compute the g() function in the encryption and decryption diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h index b66f19ac600f2..7590bfb24d79b 100644 --- a/drivers/crypto/nx/nx-842.h +++ b/drivers/crypto/nx/nx-842.h @@ -3,10 +3,10 @@ #ifndef __NX_842_H__ #define __NX_842_H__ +#include #include #include #include -#include #include #include #include diff --git a/include/crypto/aria.h b/include/crypto/aria.h index 254da46cc385c..73295146be119 100644 --- a/include/crypto/aria.h +++ b/include/crypto/aria.h @@ -18,11 +18,11 @@ #ifndef _CRYPTO_ARIA_H #define _CRYPTO_ARIA_H +#include #include #include #include #include -#include #include #define ARIA_MIN_KEY_SIZE 16 diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h index cfc47e18820fb..49339003bd2ce 100644 --- a/include/crypto/internal/acompress.h +++ b/include/crypto/internal/acompress.h @@ -8,7 +8,9 @@ */ #ifndef _CRYPTO_ACOMP_INT_H #define _CRYPTO_ACOMP_INT_H + #include +#include /* * Transform internal helpers. diff --git a/include/crypto/internal/scompress.h b/include/crypto/internal/scompress.h index f834274c2493f..252cc949d4ee5 100644 --- a/include/crypto/internal/scompress.h +++ b/include/crypto/internal/scompress.h @@ -8,7 +8,8 @@ */ #ifndef _CRYPTO_SCOMP_INT_H #define _CRYPTO_SCOMP_INT_H -#include + +#include #define SCOMP_SCRATCH_SIZE 131072 -- GitLab From e634ac4a8aaab37bdc69177df9b40acf92eccc6d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:31 +0800 Subject: [PATCH 1072/1423] crypto: api - Add crypto_tfm_ctx_dma This patch adds the helpers crypto_tfm_ctx_aligned and crypto_tfm_ctx_dma. The first aligns the tfm context to the value cra_alignmask. The second sets the alignment according to dma_cache_get_alignment(); This patch also moves crypto_tfm_ctx into algapi.h. Signed-off-by: Herbert Xu --- include/crypto/algapi.h | 41 +++++++++++++++++++++++++++++++++++++++-- include/linux/crypto.h | 5 ----- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index f50c5d1725da5..4c99eb66e654c 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -7,6 +7,7 @@ #ifndef _CRYPTO_ALGAPI_H #define _CRYPTO_ALGAPI_H +#include #include #include #include @@ -25,6 +26,14 @@ #define MAX_CIPHER_BLOCKSIZE 16 #define MAX_CIPHER_ALIGNMASK 15 +#ifdef ARCH_DMA_MINALIGN +#define CRYPTO_DMA_ALIGN ARCH_DMA_MINALIGN +#else +#define CRYPTO_DMA_ALIGN CRYPTO_MINALIGN +#endif + +#define CRYPTO_DMA_PADDING ((CRYPTO_DMA_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) + struct crypto_aead; struct crypto_instance; struct module; @@ -189,10 +198,38 @@ static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2, } } +static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) +{ + return tfm->__crt_ctx; +} + +static inline void *crypto_tfm_ctx_align(struct crypto_tfm *tfm, + unsigned int align) +{ + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(crypto_tfm_ctx(tfm), align); +} + static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm) { - return PTR_ALIGN(crypto_tfm_ctx(tfm), - crypto_tfm_alg_alignmask(tfm) + 1); + return crypto_tfm_ctx_align(tfm, crypto_tfm_alg_alignmask(tfm) + 1); +} + +static inline unsigned int crypto_dma_align(void) +{ + return CRYPTO_DMA_ALIGN; +} + +static inline unsigned int crypto_dma_padding(void) +{ + return (crypto_dma_align() - 1) & ~(crypto_tfm_ctx_alignment() - 1); +} + +static inline void *crypto_tfm_ctx_dma(struct crypto_tfm *tfm) +{ + return crypto_tfm_ctx_align(tfm, crypto_dma_align()); } static inline struct crypto_instance *crypto_tfm_alg_instance( diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 2324ab6f1846b..5d1e961f810ec 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -714,11 +714,6 @@ static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags) tfm->crt_flags &= ~flags; } -static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) -{ - return tfm->__crt_ctx; -} - static inline unsigned int crypto_tfm_ctx_alignment(void) { struct crypto_tfm *tfm; -- GitLab From f8e4d1d0ac832de8efc98f302acf9476bbfffb55 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:33 +0800 Subject: [PATCH 1073/1423] crypto: aead - Add ctx helpers with DMA alignment This patch adds helpers to access the aead context structure and request context structure with an added alignment for DMA access. Signed-off-by: Herbert Xu --- include/crypto/internal/aead.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/crypto/internal/aead.h b/include/crypto/internal/aead.h index d482017f3e203..cd8cb1e921b7b 100644 --- a/include/crypto/internal/aead.h +++ b/include/crypto/internal/aead.h @@ -39,6 +39,11 @@ static inline void *crypto_aead_ctx(struct crypto_aead *tfm) return crypto_tfm_ctx(&tfm->base); } +static inline void *crypto_aead_ctx_dma(struct crypto_aead *tfm) +{ + return crypto_tfm_ctx_dma(&tfm->base); +} + static inline struct crypto_instance *aead_crypto_instance( struct aead_instance *inst) { @@ -65,6 +70,16 @@ static inline void *aead_request_ctx(struct aead_request *req) return req->__ctx; } +static inline void *aead_request_ctx_dma(struct aead_request *req) +{ + unsigned int align = crypto_dma_align(); + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(aead_request_ctx(req), align); +} + static inline void aead_request_complete(struct aead_request *req, int err) { req->base.complete(&req->base, err); @@ -108,6 +123,13 @@ static inline void crypto_aead_set_reqsize(struct crypto_aead *aead, aead->reqsize = reqsize; } +static inline void crypto_aead_set_reqsize_dma(struct crypto_aead *aead, + unsigned int reqsize) +{ + reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); + aead->reqsize = reqsize; +} + static inline void aead_init_queue(struct aead_queue *queue, unsigned int max_qlen) { -- GitLab From b5f755fbd5d1102104c502ae213e9b42478c098f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:35 +0800 Subject: [PATCH 1074/1423] crypto: hash - Add ctx helpers with DMA alignment This patch adds helpers to access the ahash context structure and request context structure with an added alignment for DMA access. Signed-off-by: Herbert Xu --- include/crypto/internal/hash.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 0a288dddcf5be..1a2a41b79253c 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -140,6 +140,11 @@ static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) return crypto_tfm_ctx(crypto_ahash_tfm(tfm)); } +static inline void *crypto_ahash_ctx_dma(struct crypto_ahash *tfm) +{ + return crypto_tfm_ctx_dma(crypto_ahash_tfm(tfm)); +} + static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg) { return container_of(__crypto_hash_alg_common(alg), struct ahash_alg, @@ -152,6 +157,13 @@ static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm, tfm->reqsize = reqsize; } +static inline void crypto_ahash_set_reqsize_dma(struct crypto_ahash *ahash, + unsigned int reqsize) +{ + reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); + ahash->reqsize = reqsize; +} + static inline struct crypto_instance *ahash_crypto_instance( struct ahash_instance *inst) { @@ -175,6 +187,16 @@ static inline void *ahash_instance_ctx(struct ahash_instance *inst) return crypto_instance_ctx(ahash_crypto_instance(inst)); } +static inline void *ahash_request_ctx_dma(struct ahash_request *req) +{ + unsigned int align = crypto_dma_align(); + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(ahash_request_ctx(req), align); +} + static inline void ahash_request_complete(struct ahash_request *req, int err) { req->base.complete(&req->base, err); -- GitLab From 12658ac5e612214023c26f0689e6bbe8bbea0871 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:37 +0800 Subject: [PATCH 1075/1423] crypto: skcipher - Add ctx helpers with DMA alignment This patch adds helpers to access the skcipher context structure and request context structure with an added alignment for DMA access. Signed-off-by: Herbert Xu --- include/crypto/internal/skcipher.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 2a97540156bbf..06d0a5491cf38 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -130,6 +130,13 @@ static inline void crypto_skcipher_set_reqsize( skcipher->reqsize = reqsize; } +static inline void crypto_skcipher_set_reqsize_dma( + struct crypto_skcipher *skcipher, unsigned int reqsize) +{ + reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); + skcipher->reqsize = reqsize; +} + int crypto_register_skcipher(struct skcipher_alg *alg); void crypto_unregister_skcipher(struct skcipher_alg *alg); int crypto_register_skciphers(struct skcipher_alg *algs, int count); @@ -159,11 +166,26 @@ static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm) return crypto_tfm_ctx(&tfm->base); } +static inline void *crypto_skcipher_ctx_dma(struct crypto_skcipher *tfm) +{ + return crypto_tfm_ctx_dma(&tfm->base); +} + static inline void *skcipher_request_ctx(struct skcipher_request *req) { return req->__ctx; } +static inline void *skcipher_request_ctx_dma(struct skcipher_request *req) +{ + unsigned int align = crypto_dma_align(); + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(skcipher_request_ctx(req), align); +} + static inline u32 skcipher_request_flags(struct skcipher_request *req) { return req->base.flags; -- GitLab From 1c799571976da15e055f32a0e244697500e97f64 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:39 +0800 Subject: [PATCH 1076/1423] crypto: api - Increase MAX_ALGAPI_ALIGNMASK to 127 Previously we limited the maximum alignment mask to 63. This is mostly due to stack usage for shash. This patch introduces a separate limit for shash algorithms and increases the general limit to 127 which is the value that we need for DMA allocations on arm64. Signed-off-by: Herbert Xu --- crypto/shash.c | 9 +++++++-- include/crypto/algapi.h | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/crypto/shash.c b/crypto/shash.c index 0f85431588267..868b6ba2b3b74 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -18,6 +18,8 @@ #include "internal.h" +#define MAX_SHASH_ALIGNMASK 63 + static const struct crypto_type crypto_shash_type; int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, @@ -88,7 +90,7 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data, * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ - u8 ubuf[MAX_ALGAPI_ALIGNMASK * 2]; + u8 ubuf[MAX_SHASH_ALIGNMASK * 2]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; @@ -130,7 +132,7 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out) * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ - u8 ubuf[MAX_ALGAPI_ALIGNMASK + HASH_MAX_DIGESTSIZE]; + u8 ubuf[MAX_SHASH_ALIGNMASK + HASH_MAX_DIGESTSIZE]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; @@ -524,6 +526,9 @@ static int shash_prepare_alg(struct shash_alg *alg) alg->statesize > HASH_MAX_STATESIZE) return -EINVAL; + if (base->cra_alignmask > MAX_SHASH_ALIGNMASK) + return -EINVAL; + if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 4c99eb66e654c..8722fd67f40ae 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -22,7 +22,7 @@ * algs and architectures. Ciphers have a lower maximum size. */ #define MAX_ALGAPI_BLOCKSIZE 160 -#define MAX_ALGAPI_ALIGNMASK 63 +#define MAX_ALGAPI_ALIGNMASK 127 #define MAX_CIPHER_BLOCKSIZE 16 #define MAX_CIPHER_ALIGNMASK 15 -- GitLab From 4ac3377645e98d319cb5404e72d40a4aa69d252c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:41 +0800 Subject: [PATCH 1077/1423] crypto: akcipher - Add ctx helpers with DMA alignment This patch adds helpers to access the akcipher context structure and request context structure with an added alignment for DMA access. Signed-off-by: Herbert Xu --- include/crypto/internal/akcipher.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h index 1474a2d890fc4..aaf1092b93b80 100644 --- a/include/crypto/internal/akcipher.h +++ b/include/crypto/internal/akcipher.h @@ -33,15 +33,37 @@ static inline void *akcipher_request_ctx(struct akcipher_request *req) return req->__ctx; } +static inline void *akcipher_request_ctx_dma(struct akcipher_request *req) +{ + unsigned int align = crypto_dma_align(); + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(akcipher_request_ctx(req), align); +} + static inline void akcipher_set_reqsize(struct crypto_akcipher *akcipher, unsigned int reqsize) { akcipher->reqsize = reqsize; } +static inline void akcipher_set_reqsize_dma(struct crypto_akcipher *akcipher, + unsigned int reqsize) +{ + reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); + akcipher->reqsize = reqsize; +} + static inline void *akcipher_tfm_ctx(struct crypto_akcipher *tfm) { - return tfm->base.__crt_ctx; + return crypto_tfm_ctx(&tfm->base); +} + +static inline void *akcipher_tfm_ctx_dma(struct crypto_akcipher *tfm) +{ + return crypto_tfm_ctx_dma(&tfm->base); } static inline void akcipher_request_complete(struct akcipher_request *req, -- GitLab From a5a49249effb6f03086214b25719d415cc867b3d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:43 +0800 Subject: [PATCH 1078/1423] crypto: kpp - Add ctx helpers with DMA alignment This patch adds helpers to access the kpp context structure and request context structure with an added alignment for DMA access. Signed-off-by: Herbert Xu --- include/crypto/internal/kpp.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/crypto/internal/kpp.h b/include/crypto/internal/kpp.h index 167662407e36b..3c9726e89f537 100644 --- a/include/crypto/internal/kpp.h +++ b/include/crypto/internal/kpp.h @@ -50,15 +50,37 @@ static inline void *kpp_request_ctx(struct kpp_request *req) return req->__ctx; } +static inline void *kpp_request_ctx_dma(struct kpp_request *req) +{ + unsigned int align = crypto_dma_align(); + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + + return PTR_ALIGN(kpp_request_ctx(req), align); +} + static inline void kpp_set_reqsize(struct crypto_kpp *kpp, unsigned int reqsize) { kpp->reqsize = reqsize; } +static inline void kpp_set_reqsize_dma(struct crypto_kpp *kpp, + unsigned int reqsize) +{ + reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); + kpp->reqsize = reqsize; +} + static inline void *kpp_tfm_ctx(struct crypto_kpp *tfm) { - return tfm->base.__crt_ctx; + return crypto_tfm_ctx(&tfm->base); +} + +static inline void *kpp_tfm_ctx_dma(struct crypto_kpp *tfm) +{ + return crypto_tfm_ctx_dma(&tfm->base); } static inline void kpp_request_complete(struct kpp_request *req, int err) -- GitLab From 4cb4f7c11deef5222ac15631b16ab54625b926b3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Nov 2022 12:36:45 +0800 Subject: [PATCH 1079/1423] crypto: caam - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 72 ++++++------- drivers/crypto/caam/caamalg_qi.c | 52 ++++----- drivers/crypto/caam/caamalg_qi2.c | 173 +++++++++++++++--------------- drivers/crypto/caam/caamhash.c | 87 ++++++++------- drivers/crypto/caam/caampkc.c | 47 ++++---- 5 files changed, 216 insertions(+), 215 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index d3d8bb0a69900..ecc15bc521db1 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -131,7 +131,7 @@ struct caam_aead_req_ctx { static int aead_null_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); u32 *desc; @@ -184,7 +184,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead) struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), struct caam_aead_alg, aead); unsigned int ivsize = crypto_aead_ivsize(aead); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); u32 ctx1_iv_off = 0; @@ -312,7 +312,7 @@ skip_givenc: static int aead_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); ctx->authsize = authsize; aead_set_sh_desc(authenc); @@ -322,7 +322,7 @@ static int aead_setauthsize(struct crypto_aead *authenc, static int gcm_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; unsigned int ivsize = crypto_aead_ivsize(aead); u32 *desc; @@ -372,7 +372,7 @@ static int gcm_set_sh_desc(struct crypto_aead *aead) static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_gcm_check_authsize(authsize); @@ -387,7 +387,7 @@ static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) static int rfc4106_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; unsigned int ivsize = crypto_aead_ivsize(aead); u32 *desc; @@ -440,7 +440,7 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead) static int rfc4106_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_rfc4106_check_authsize(authsize); @@ -455,7 +455,7 @@ static int rfc4106_setauthsize(struct crypto_aead *authenc, static int rfc4543_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; unsigned int ivsize = crypto_aead_ivsize(aead); u32 *desc; @@ -508,7 +508,7 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead) static int rfc4543_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); if (authsize != 16) return -EINVAL; @@ -521,7 +521,7 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc, static int chachapoly_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; unsigned int ivsize = crypto_aead_ivsize(aead); u32 *desc; @@ -547,7 +547,7 @@ static int chachapoly_set_sh_desc(struct crypto_aead *aead) static int chachapoly_setauthsize(struct crypto_aead *aead, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); if (authsize != POLY1305_DIGEST_SIZE) return -EINVAL; @@ -559,7 +559,7 @@ static int chachapoly_setauthsize(struct crypto_aead *aead, static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; @@ -575,7 +575,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, static int aead_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); struct crypto_authenc_keys keys; @@ -656,7 +656,7 @@ static int des3_aead_setkey(struct crypto_aead *aead, const u8 *key, static int gcm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int err; @@ -677,7 +677,7 @@ static int gcm_setkey(struct crypto_aead *aead, static int rfc4106_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int err; @@ -703,7 +703,7 @@ static int rfc4106_setkey(struct crypto_aead *aead, static int rfc4543_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int err; @@ -729,7 +729,7 @@ static int rfc4543_setkey(struct crypto_aead *aead, static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen, const u32 ctx1_iv_off) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct caam_skcipher_alg *alg = container_of(crypto_skcipher_alg(skcipher), typeof(*alg), skcipher); @@ -832,7 +832,7 @@ static int des3_skcipher_setkey(struct crypto_skcipher *skcipher, static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); u32 *desc; @@ -1057,7 +1057,7 @@ static void init_aead_job(struct aead_request *req, bool all_contig, bool encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); int authsize = ctx->authsize; u32 *desc = edesc->hw_desc; u32 out_options, in_options; @@ -1118,7 +1118,7 @@ static void init_gcm_job(struct aead_request *req, bool all_contig, bool encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); u32 *desc = edesc->hw_desc; bool generic_gcm = (ivsize == GCM_AES_IV_SIZE); @@ -1185,7 +1185,7 @@ static void init_authenc_job(struct aead_request *req, struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), struct caam_aead_alg, aead); unsigned int ivsize = crypto_aead_ivsize(aead); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctx->jrdev->parent); const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) == OP_ALG_AAI_CTR_MOD128); @@ -1234,7 +1234,7 @@ static void init_skcipher_job(struct skcipher_request *req, const bool encrypt) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *jrdev = ctx->jrdev; int ivsize = crypto_skcipher_ivsize(skcipher); u32 *desc = edesc->hw_desc; @@ -1290,7 +1290,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, bool encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; struct caam_aead_req_ctx *rctx = aead_request_ctx(req); gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? @@ -1457,7 +1457,7 @@ static inline int chachapoly_crypt(struct aead_request *req, bool encrypt) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; bool all_contig; u32 *desc; @@ -1491,7 +1491,7 @@ static inline int aead_crypt(struct aead_request *req, bool encrypt) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; bool all_contig; @@ -1524,7 +1524,7 @@ static int aead_decrypt(struct aead_request *req) static int aead_do_one_req(struct crypto_engine *engine, void *areq) { struct aead_request *req = aead_request_cast(areq); - struct caam_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); + struct caam_ctx *ctx = crypto_aead_ctx_dma(crypto_aead_reqtfm(req)); struct caam_aead_req_ctx *rctx = aead_request_ctx(req); u32 *desc = rctx->edesc->hw_desc; int ret; @@ -1550,7 +1550,7 @@ static inline int gcm_crypt(struct aead_request *req, bool encrypt) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; bool all_contig; @@ -1597,7 +1597,7 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, int desc_bytes) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct caam_skcipher_req_ctx *rctx = skcipher_request_ctx(req); struct device *jrdev = ctx->jrdev; gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? @@ -1756,7 +1756,7 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, static int skcipher_do_one_req(struct crypto_engine *engine, void *areq) { struct skcipher_request *req = skcipher_request_cast(areq); - struct caam_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(crypto_skcipher_reqtfm(req)); struct caam_skcipher_req_ctx *rctx = skcipher_request_ctx(req); u32 *desc = rctx->edesc->hw_desc; int ret; @@ -1790,7 +1790,7 @@ static inline int skcipher_crypt(struct skcipher_request *req, bool encrypt) { struct skcipher_edesc *edesc; struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *jrdev = ctx->jrdev; struct caam_drv_private_jr *jrpriv = dev_get_drvdata(jrdev); struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); @@ -3397,7 +3397,7 @@ static int caam_cra_init(struct crypto_skcipher *tfm) struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct caam_skcipher_alg *caam_alg = container_of(alg, typeof(*caam_alg), skcipher); - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); u32 alg_aai = caam_alg->caam.class1_alg_type & OP_ALG_AAI_MASK; int ret = 0; @@ -3434,7 +3434,7 @@ static int caam_aead_init(struct crypto_aead *tfm) struct aead_alg *alg = crypto_aead_alg(tfm); struct caam_aead_alg *caam_alg = container_of(alg, struct caam_aead_alg, aead); - struct caam_ctx *ctx = crypto_aead_ctx(tfm); + struct caam_ctx *ctx = crypto_aead_ctx_dma(tfm); crypto_aead_set_reqsize(tfm, sizeof(struct caam_aead_req_ctx)); @@ -3454,7 +3454,7 @@ static void caam_exit_common(struct caam_ctx *ctx) static void caam_cra_exit(struct crypto_skcipher *tfm) { - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); if (ctx->fallback) crypto_free_skcipher(ctx->fallback); @@ -3463,7 +3463,7 @@ static void caam_cra_exit(struct crypto_skcipher *tfm) static void caam_aead_exit(struct crypto_aead *tfm) { - caam_exit_common(crypto_aead_ctx(tfm)); + caam_exit_common(crypto_aead_ctx_dma(tfm)); } void caam_algapi_exit(void) @@ -3491,7 +3491,7 @@ static void caam_skcipher_alg_init(struct caam_skcipher_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags |= (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY); @@ -3505,7 +3505,7 @@ static void caam_aead_alg_init(struct caam_aead_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY; diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c index 189a7438b29c4..c37b67be0492d 100644 --- a/drivers/crypto/caam/caamalg_qi.c +++ b/drivers/crypto/caam/caamalg_qi.c @@ -81,7 +81,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead) { struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), typeof(*alg), aead); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); u32 ctx1_iv_off = 0; u32 *nonce = NULL; @@ -184,7 +184,7 @@ skip_givenc: static int aead_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); ctx->authsize = authsize; aead_set_sh_desc(authenc); @@ -195,7 +195,7 @@ static int aead_setauthsize(struct crypto_aead *authenc, unsigned int authsize) static int aead_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); struct crypto_authenc_keys keys; @@ -299,7 +299,7 @@ static int des3_aead_setkey(struct crypto_aead *aead, const u8 *key, static int gcm_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); int rem_bytes = CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN - ctx->cdata.keylen; @@ -342,7 +342,7 @@ static int gcm_set_sh_desc(struct crypto_aead *aead) static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_gcm_check_authsize(authsize); @@ -358,7 +358,7 @@ static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) static int gcm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int ret; @@ -402,7 +402,7 @@ static int gcm_setkey(struct crypto_aead *aead, static int rfc4106_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); int rem_bytes = CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN - ctx->cdata.keylen; @@ -446,7 +446,7 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead) static int rfc4106_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_rfc4106_check_authsize(authsize); @@ -462,7 +462,7 @@ static int rfc4106_setauthsize(struct crypto_aead *authenc, static int rfc4106_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int ret; @@ -510,7 +510,7 @@ static int rfc4106_setkey(struct crypto_aead *aead, static int rfc4543_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); int rem_bytes = CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN - ctx->cdata.keylen; @@ -554,7 +554,7 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead) static int rfc4543_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); if (authsize != 16) return -EINVAL; @@ -568,7 +568,7 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc, static int rfc4543_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *jrdev = ctx->jrdev; int ret; @@ -617,7 +617,7 @@ static int rfc4543_setkey(struct crypto_aead *aead, static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen, const u32 ctx1_iv_off) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct caam_skcipher_alg *alg = container_of(crypto_skcipher_alg(skcipher), typeof(*alg), skcipher); @@ -731,7 +731,7 @@ static int des_skcipher_setkey(struct crypto_skcipher *skcipher, static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); int ret = 0; @@ -915,7 +915,7 @@ static void aead_done(struct caam_drv_req *drv_req, u32 status) struct aead_edesc *edesc; struct aead_request *aead_req = drv_req->app_ctx; struct crypto_aead *aead = crypto_aead_reqtfm(aead_req); - struct caam_ctx *caam_ctx = crypto_aead_ctx(aead); + struct caam_ctx *caam_ctx = crypto_aead_ctx_dma(aead); int ecode = 0; qidev = caam_ctx->qidev; @@ -937,7 +937,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, bool encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), typeof(*alg), aead); struct device *qidev = ctx->qidev; @@ -1157,7 +1157,7 @@ static inline int aead_crypt(struct aead_request *req, bool encrypt) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); int ret; if (unlikely(caam_congested)) @@ -1207,7 +1207,7 @@ static void skcipher_done(struct caam_drv_req *drv_req, u32 status) struct skcipher_edesc *edesc; struct skcipher_request *req = drv_req->app_ctx; struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *caam_ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *caam_ctx = crypto_skcipher_ctx_dma(skcipher); struct device *qidev = caam_ctx->qidev; int ivsize = crypto_skcipher_ivsize(skcipher); int ecode = 0; @@ -1245,7 +1245,7 @@ static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req, bool encrypt) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *qidev = ctx->qidev; gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; @@ -1405,7 +1405,7 @@ static inline int skcipher_crypt(struct skcipher_request *req, bool encrypt) { struct skcipher_edesc *edesc; struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctx->jrdev->parent); int ret; @@ -2491,7 +2491,7 @@ static int caam_cra_init(struct crypto_skcipher *tfm) struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct caam_skcipher_alg *caam_alg = container_of(alg, typeof(*caam_alg), skcipher); - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); u32 alg_aai = caam_alg->caam.class1_alg_type & OP_ALG_AAI_MASK; int ret = 0; @@ -2524,7 +2524,7 @@ static int caam_aead_init(struct crypto_aead *tfm) struct aead_alg *alg = crypto_aead_alg(tfm); struct caam_aead_alg *caam_alg = container_of(alg, typeof(*caam_alg), aead); - struct caam_ctx *ctx = crypto_aead_ctx(tfm); + struct caam_ctx *ctx = crypto_aead_ctx_dma(tfm); return caam_init_common(ctx, &caam_alg->caam, !caam_alg->caam.nodkp); } @@ -2542,7 +2542,7 @@ static void caam_exit_common(struct caam_ctx *ctx) static void caam_cra_exit(struct crypto_skcipher *tfm) { - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); if (ctx->fallback) crypto_free_skcipher(ctx->fallback); @@ -2551,7 +2551,7 @@ static void caam_cra_exit(struct crypto_skcipher *tfm) static void caam_aead_exit(struct crypto_aead *tfm) { - caam_exit_common(crypto_aead_ctx(tfm)); + caam_exit_common(crypto_aead_ctx_dma(tfm)); } void caam_qi_algapi_exit(void) @@ -2579,7 +2579,7 @@ static void caam_skcipher_alg_init(struct caam_skcipher_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags |= (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY); @@ -2593,7 +2593,7 @@ static void caam_aead_alg_init(struct caam_aead_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY; diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 4482cb145d051..1b0dd742c53f9 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -134,12 +134,12 @@ static struct caam_request *to_caam_req(struct crypto_async_request *areq) { switch (crypto_tfm_alg_type(areq->tfm)) { case CRYPTO_ALG_TYPE_SKCIPHER: - return skcipher_request_ctx(skcipher_request_cast(areq)); + return skcipher_request_ctx_dma(skcipher_request_cast(areq)); case CRYPTO_ALG_TYPE_AEAD: - return aead_request_ctx(container_of(areq, struct aead_request, - base)); + return aead_request_ctx_dma( + container_of(areq, struct aead_request, base)); case CRYPTO_ALG_TYPE_AHASH: - return ahash_request_ctx(ahash_request_cast(areq)); + return ahash_request_ctx_dma(ahash_request_cast(areq)); default: return ERR_PTR(-EINVAL); } @@ -171,7 +171,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead) { struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), typeof(*alg), aead); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); struct device *dev = ctx->dev; struct dpaa2_caam_priv *priv = dev_get_drvdata(dev); @@ -276,7 +276,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead) static int aead_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); ctx->authsize = authsize; aead_set_sh_desc(authenc); @@ -287,7 +287,7 @@ static int aead_setauthsize(struct crypto_aead *authenc, unsigned int authsize) static int aead_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; struct crypto_authenc_keys keys; @@ -350,10 +350,10 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, bool encrypt) { struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_request *req_ctx = aead_request_ctx(req); + struct caam_request *req_ctx = aead_request_ctx_dma(req); struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct caam_aead_alg *alg = container_of(crypto_aead_alg(aead), typeof(*alg), aead); struct device *dev = ctx->dev; @@ -587,7 +587,7 @@ skip_out_fle: static int chachapoly_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); struct device *dev = ctx->dev; struct caam_flc *flc; @@ -620,7 +620,7 @@ static int chachapoly_set_sh_desc(struct crypto_aead *aead) static int chachapoly_setauthsize(struct crypto_aead *aead, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); if (authsize != POLY1305_DIGEST_SIZE) return -EINVAL; @@ -632,7 +632,7 @@ static int chachapoly_setauthsize(struct crypto_aead *aead, static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; @@ -647,7 +647,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, static int gcm_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; unsigned int ivsize = crypto_aead_ivsize(aead); struct caam_flc *flc; @@ -704,7 +704,7 @@ static int gcm_set_sh_desc(struct crypto_aead *aead) static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_gcm_check_authsize(authsize); @@ -720,7 +720,7 @@ static int gcm_setauthsize(struct crypto_aead *authenc, unsigned int authsize) static int gcm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; int ret; @@ -739,7 +739,7 @@ static int gcm_setkey(struct crypto_aead *aead, static int rfc4106_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; unsigned int ivsize = crypto_aead_ivsize(aead); struct caam_flc *flc; @@ -799,7 +799,7 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead) static int rfc4106_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); int err; err = crypto_rfc4106_check_authsize(authsize); @@ -815,7 +815,7 @@ static int rfc4106_setauthsize(struct crypto_aead *authenc, static int rfc4106_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; int ret; @@ -840,7 +840,7 @@ static int rfc4106_setkey(struct crypto_aead *aead, static int rfc4543_set_sh_desc(struct crypto_aead *aead) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; unsigned int ivsize = crypto_aead_ivsize(aead); struct caam_flc *flc; @@ -900,7 +900,7 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead) static int rfc4543_setauthsize(struct crypto_aead *authenc, unsigned int authsize) { - struct caam_ctx *ctx = crypto_aead_ctx(authenc); + struct caam_ctx *ctx = crypto_aead_ctx_dma(authenc); if (authsize != 16) return -EINVAL; @@ -914,7 +914,7 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc, static int rfc4543_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); struct device *dev = ctx->dev; int ret; @@ -940,7 +940,7 @@ static int rfc4543_setkey(struct crypto_aead *aead, static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen, const u32 ctx1_iv_off) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct caam_skcipher_alg *alg = container_of(crypto_skcipher_alg(skcipher), struct caam_skcipher_alg, skcipher); @@ -1059,7 +1059,7 @@ static int des3_skcipher_setkey(struct crypto_skcipher *skcipher, static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *dev = ctx->dev; struct dpaa2_caam_priv *priv = dev_get_drvdata(dev); struct caam_flc *flc; @@ -1109,10 +1109,10 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, static struct skcipher_edesc *skcipher_edesc_alloc(struct skcipher_request *req) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_request *req_ctx = skcipher_request_ctx(req); + struct caam_request *req_ctx = skcipher_request_ctx_dma(req); struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct device *dev = ctx->dev; gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; @@ -1286,7 +1286,7 @@ static void aead_encrypt_done(void *cbk_ctx, u32 status) struct caam_request *req_ctx = to_caam_req(areq); struct aead_edesc *edesc = req_ctx->edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); int ecode = 0; dev_dbg(ctx->dev, "%s %d: err 0x%x\n", __func__, __LINE__, status); @@ -1307,7 +1307,7 @@ static void aead_decrypt_done(void *cbk_ctx, u32 status) struct caam_request *req_ctx = to_caam_req(areq); struct aead_edesc *edesc = req_ctx->edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); int ecode = 0; dev_dbg(ctx->dev, "%s %d: err 0x%x\n", __func__, __LINE__, status); @@ -1324,8 +1324,8 @@ static int aead_encrypt(struct aead_request *req) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); - struct caam_request *caam_req = aead_request_ctx(req); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); + struct caam_request *caam_req = aead_request_ctx_dma(req); int ret; /* allocate extended descriptor */ @@ -1352,8 +1352,8 @@ static int aead_decrypt(struct aead_request *req) { struct aead_edesc *edesc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct caam_ctx *ctx = crypto_aead_ctx(aead); - struct caam_request *caam_req = aead_request_ctx(req); + struct caam_ctx *ctx = crypto_aead_ctx_dma(aead); + struct caam_request *caam_req = aead_request_ctx_dma(req); int ret; /* allocate extended descriptor */ @@ -1392,7 +1392,7 @@ static void skcipher_encrypt_done(void *cbk_ctx, u32 status) struct skcipher_request *req = skcipher_request_cast(areq); struct caam_request *req_ctx = to_caam_req(areq); struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct skcipher_edesc *edesc = req_ctx->edesc; int ecode = 0; int ivsize = crypto_skcipher_ivsize(skcipher); @@ -1430,7 +1430,7 @@ static void skcipher_decrypt_done(void *cbk_ctx, u32 status) struct skcipher_request *req = skcipher_request_cast(areq); struct caam_request *req_ctx = to_caam_req(areq); struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); struct skcipher_edesc *edesc = req_ctx->edesc; int ecode = 0; int ivsize = crypto_skcipher_ivsize(skcipher); @@ -1474,8 +1474,8 @@ static int skcipher_encrypt(struct skcipher_request *req) { struct skcipher_edesc *edesc; struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); - struct caam_request *caam_req = skcipher_request_ctx(req); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); + struct caam_request *caam_req = skcipher_request_ctx_dma(req); struct dpaa2_caam_priv *priv = dev_get_drvdata(ctx->dev); int ret; @@ -1524,8 +1524,8 @@ static int skcipher_decrypt(struct skcipher_request *req) { struct skcipher_edesc *edesc; struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); - struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher); - struct caam_request *caam_req = skcipher_request_ctx(req); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(skcipher); + struct caam_request *caam_req = skcipher_request_ctx_dma(req); struct dpaa2_caam_priv *priv = dev_get_drvdata(ctx->dev); int ret; @@ -1603,7 +1603,7 @@ static int caam_cra_init_skcipher(struct crypto_skcipher *tfm) struct skcipher_alg *alg = crypto_skcipher_alg(tfm); struct caam_skcipher_alg *caam_alg = container_of(alg, typeof(*caam_alg), skcipher); - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); u32 alg_aai = caam_alg->caam.class1_alg_type & OP_ALG_AAI_MASK; int ret = 0; @@ -1621,10 +1621,12 @@ static int caam_cra_init_skcipher(struct crypto_skcipher *tfm) } ctx->fallback = fallback; - crypto_skcipher_set_reqsize(tfm, sizeof(struct caam_request) + - crypto_skcipher_reqsize(fallback)); + crypto_skcipher_set_reqsize_dma( + tfm, sizeof(struct caam_request) + + crypto_skcipher_reqsize(fallback)); } else { - crypto_skcipher_set_reqsize(tfm, sizeof(struct caam_request)); + crypto_skcipher_set_reqsize_dma(tfm, + sizeof(struct caam_request)); } ret = caam_cra_init(ctx, &caam_alg->caam, false); @@ -1640,8 +1642,8 @@ static int caam_cra_init_aead(struct crypto_aead *tfm) struct caam_aead_alg *caam_alg = container_of(alg, typeof(*caam_alg), aead); - crypto_aead_set_reqsize(tfm, sizeof(struct caam_request)); - return caam_cra_init(crypto_aead_ctx(tfm), &caam_alg->caam, + crypto_aead_set_reqsize_dma(tfm, sizeof(struct caam_request)); + return caam_cra_init(crypto_aead_ctx_dma(tfm), &caam_alg->caam, !caam_alg->caam.nodkp); } @@ -1654,7 +1656,7 @@ static void caam_exit_common(struct caam_ctx *ctx) static void caam_cra_exit(struct crypto_skcipher *tfm) { - struct caam_ctx *ctx = crypto_skcipher_ctx(tfm); + struct caam_ctx *ctx = crypto_skcipher_ctx_dma(tfm); if (ctx->fallback) crypto_free_skcipher(ctx->fallback); @@ -1663,7 +1665,7 @@ static void caam_cra_exit(struct crypto_skcipher *tfm) static void caam_cra_exit_aead(struct crypto_aead *tfm) { - caam_exit_common(crypto_aead_ctx(tfm)); + caam_exit_common(crypto_aead_ctx_dma(tfm)); } static struct caam_skcipher_alg driver_algs[] = { @@ -3008,7 +3010,7 @@ static void caam_skcipher_alg_init(struct caam_skcipher_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags |= (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY); @@ -3022,7 +3024,7 @@ static void caam_aead_alg_init(struct caam_aead_alg *t_alg) alg->base.cra_module = THIS_MODULE; alg->base.cra_priority = CAAM_CRA_PRIORITY; - alg->base.cra_ctxsize = sizeof(struct caam_ctx); + alg->base.cra_ctxsize = sizeof(struct caam_ctx) + crypto_dma_padding(); alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | CRYPTO_ALG_KERN_DRIVER_ONLY; @@ -3132,7 +3134,7 @@ static inline int ctx_map_to_qm_sg(struct device *dev, static int ahash_set_sh_desc(struct crypto_ahash *ahash) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); struct dpaa2_caam_priv *priv = dev_get_drvdata(ctx->dev); struct caam_flc *flc; @@ -3305,7 +3307,7 @@ err_flc: static int ahash_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); unsigned int blocksize = crypto_tfm_alg_blocksize(&ahash->base); unsigned int digestsize = crypto_ahash_digestsize(ahash); int ret; @@ -3356,7 +3358,7 @@ bad_free_key: static inline void ahash_unmap(struct device *dev, struct ahash_edesc *edesc, struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); if (edesc->src_nents) dma_unmap_sg(dev, req->src, edesc->src_nents, DMA_TO_DEVICE); @@ -3376,7 +3378,7 @@ static inline void ahash_unmap_ctx(struct device *dev, struct ahash_edesc *edesc, struct ahash_request *req, u32 flag) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); if (state->ctx_dma) { dma_unmap_single(dev, state->ctx_dma, state->ctx_dma_len, flag); @@ -3390,9 +3392,9 @@ static void ahash_done(void *cbk_ctx, u32 status) struct crypto_async_request *areq = cbk_ctx; struct ahash_request *req = ahash_request_cast(areq); struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct ahash_edesc *edesc = state->caam_req.edesc; - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); int ecode = 0; @@ -3417,9 +3419,9 @@ static void ahash_done_bi(void *cbk_ctx, u32 status) struct crypto_async_request *areq = cbk_ctx; struct ahash_request *req = ahash_request_cast(areq); struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct ahash_edesc *edesc = state->caam_req.edesc; - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int ecode = 0; dev_dbg(ctx->dev, "%s %d: err 0x%x\n", __func__, __LINE__, status); @@ -3455,9 +3457,9 @@ static void ahash_done_ctx_src(void *cbk_ctx, u32 status) struct crypto_async_request *areq = cbk_ctx; struct ahash_request *req = ahash_request_cast(areq); struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct ahash_edesc *edesc = state->caam_req.edesc; - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); int ecode = 0; @@ -3482,9 +3484,9 @@ static void ahash_done_ctx_dst(void *cbk_ctx, u32 status) struct crypto_async_request *areq = cbk_ctx; struct ahash_request *req = ahash_request_cast(areq); struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct ahash_edesc *edesc = state->caam_req.edesc; - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int ecode = 0; dev_dbg(ctx->dev, "%s %d: err 0x%x\n", __func__, __LINE__, status); @@ -3518,8 +3520,8 @@ static void ahash_done_ctx_dst(void *cbk_ctx, u32 status) static int ahash_update_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -3637,8 +3639,8 @@ unmap_ctx: static int ahash_final_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -3708,8 +3710,8 @@ unmap_ctx: static int ahash_finup_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -3802,8 +3804,8 @@ unmap_ctx: static int ahash_digest(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -3897,8 +3899,8 @@ unmap: static int ahash_final_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -3970,8 +3972,8 @@ unmap: static int ahash_update_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -4091,8 +4093,8 @@ unmap_ctx: static int ahash_finup_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -4187,8 +4189,8 @@ unmap: static int ahash_update_first(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_request *req_ctx = &state->caam_req; struct dpaa2_fl_entry *in_fle = &req_ctx->fd_flt[1]; struct dpaa2_fl_entry *out_fle = &req_ctx->fd_flt[0]; @@ -4320,7 +4322,7 @@ static int ahash_finup_first(struct ahash_request *req) static int ahash_init(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); state->update = ahash_update_first; state->finup = ahash_finup_first; @@ -4337,28 +4339,28 @@ static int ahash_init(struct ahash_request *req) static int ahash_update(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->update(req); } static int ahash_finup(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->finup(req); } static int ahash_final(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->final(req); } static int ahash_export(struct ahash_request *req, void *out) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_export_state *export = out; u8 *buf = state->buf; int len = state->buflen; @@ -4375,7 +4377,7 @@ static int ahash_export(struct ahash_request *req, void *out) static int ahash_import(struct ahash_request *req, const void *in) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); const struct caam_export_state *export = in; memset(state, 0, sizeof(*state)); @@ -4547,7 +4549,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) container_of(halg, struct ahash_alg, halg); struct caam_hash_alg *caam_hash = container_of(alg, struct caam_hash_alg, ahash_alg); - struct caam_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct caam_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); /* Sizes for MDHA running digests: MD5, SHA1, 224, 256, 384, 512 */ static const u8 runninglen[] = { HASH_MSG_LEN + MD5_DIGEST_SIZE, HASH_MSG_LEN + SHA1_DIGEST_SIZE, @@ -4594,8 +4596,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) OP_ALG_ALGSEL_SUBMASK) >> OP_ALG_ALGSEL_SHIFT]; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct caam_hash_state)); + crypto_ahash_set_reqsize_dma(ahash, sizeof(struct caam_hash_state)); /* * For keyed hash algorithms shared descriptors @@ -4606,7 +4607,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) static void caam_hash_cra_exit(struct crypto_tfm *tfm) { - struct caam_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct caam_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); dma_unmap_single_attrs(ctx->dev, ctx->flc_dma[0], sizeof(ctx->flc), DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC); @@ -4646,7 +4647,7 @@ static struct caam_hash_alg *caam_hash_alloc(struct device *dev, alg->cra_module = THIS_MODULE; alg->cra_init = caam_hash_cra_init; alg->cra_exit = caam_hash_cra_exit; - alg->cra_ctxsize = sizeof(struct caam_hash_ctx); + alg->cra_ctxsize = sizeof(struct caam_hash_ctx) + crypto_dma_padding(); alg->cra_priority = CAAM_CRA_PRIORITY; alg->cra_blocksize = template->blocksize; alg->cra_alignmask = 0; diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 36ef738e4a181..1050e965a4385 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -199,7 +199,7 @@ static inline int ctx_map_to_sec4_sg(struct device *jrdev, static int ahash_set_sh_desc(struct crypto_ahash *ahash) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); struct device *jrdev = ctx->jrdev; struct caam_drv_private *ctrlpriv = dev_get_drvdata(jrdev->parent); @@ -255,7 +255,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash) static int axcbc_set_sh_desc(struct crypto_ahash *ahash) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); struct device *jrdev = ctx->jrdev; u32 *desc; @@ -307,7 +307,7 @@ static int axcbc_set_sh_desc(struct crypto_ahash *ahash) static int acmac_set_sh_desc(struct crypto_ahash *ahash) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int digestsize = crypto_ahash_digestsize(ahash); struct device *jrdev = ctx->jrdev; u32 *desc; @@ -421,7 +421,7 @@ static int hash_digest_key(struct caam_hash_ctx *ctx, u32 *keylen, u8 *key, static int ahash_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); struct device *jrdev = ctx->jrdev; int blocksize = crypto_tfm_alg_blocksize(&ahash->base); int digestsize = crypto_ahash_digestsize(ahash); @@ -484,7 +484,7 @@ static int ahash_setkey(struct crypto_ahash *ahash, static int axcbc_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); struct device *jrdev = ctx->jrdev; if (keylen != AES_KEYSIZE_128) @@ -504,7 +504,7 @@ static int axcbc_setkey(struct crypto_ahash *ahash, const u8 *key, static int acmac_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int err; err = aes_check_keylen(keylen); @@ -543,7 +543,7 @@ static inline void ahash_unmap(struct device *dev, struct ahash_edesc *edesc, struct ahash_request *req, int dst_len) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); if (edesc->src_nents) dma_unmap_sg(dev, req->src, edesc->src_nents, DMA_TO_DEVICE); @@ -563,7 +563,7 @@ static inline void ahash_unmap_ctx(struct device *dev, struct ahash_edesc *edesc, struct ahash_request *req, int dst_len, u32 flag) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); if (state->ctx_dma) { dma_unmap_single(dev, state->ctx_dma, state->ctx_dma_len, flag); @@ -580,8 +580,8 @@ static inline void ahash_done_cpy(struct device *jrdev, u32 *desc, u32 err, struct ahash_edesc *edesc; struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); int digestsize = crypto_ahash_digestsize(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); int ecode = 0; bool has_bklog; @@ -630,8 +630,8 @@ static inline void ahash_done_switch(struct device *jrdev, u32 *desc, u32 err, struct caam_drv_private_jr *jrp = dev_get_drvdata(jrdev); struct ahash_edesc *edesc; struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); int digestsize = crypto_ahash_digestsize(ahash); int ecode = 0; bool has_bklog; @@ -695,8 +695,8 @@ static struct ahash_edesc *ahash_edesc_alloc(struct ahash_request *req, dma_addr_t sh_desc_dma) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; struct ahash_edesc *edesc; @@ -755,8 +755,8 @@ static int ahash_edesc_add_src(struct caam_hash_ctx *ctx, static int ahash_do_one_req(struct crypto_engine *engine, void *areq) { struct ahash_request *req = ahash_request_cast(areq); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(crypto_ahash_reqtfm(req)); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u32 *desc = state->edesc->hw_desc; int ret; @@ -785,7 +785,7 @@ static int ahash_enqueue_req(struct device *jrdev, int dst_len, enum dma_data_direction dir) { struct caam_drv_private_jr *jrpriv = dev_get_drvdata(jrdev); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct ahash_edesc *edesc = state->edesc; u32 *desc = edesc->hw_desc; int ret; @@ -815,8 +815,8 @@ static int ahash_enqueue_req(struct device *jrdev, static int ahash_update_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u8 *buf = state->buf; int *buflen = &state->buflen; @@ -940,8 +940,8 @@ unmap_ctx: static int ahash_final_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; int buflen = state->buflen; u32 *desc; @@ -1001,8 +1001,8 @@ static int ahash_final_ctx(struct ahash_request *req) static int ahash_finup_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; int buflen = state->buflen; u32 *desc; @@ -1075,8 +1075,8 @@ static int ahash_finup_ctx(struct ahash_request *req) static int ahash_digest(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u32 *desc; int digestsize = crypto_ahash_digestsize(ahash); @@ -1142,8 +1142,8 @@ static int ahash_digest(struct ahash_request *req) static int ahash_final_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u8 *buf = state->buf; int buflen = state->buflen; @@ -1191,8 +1191,8 @@ static int ahash_final_no_ctx(struct ahash_request *req) static int ahash_update_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u8 *buf = state->buf; int *buflen = &state->buflen; @@ -1312,8 +1312,8 @@ static int ahash_update_no_ctx(struct ahash_request *req) static int ahash_finup_no_ctx(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; int buflen = state->buflen; u32 *desc; @@ -1388,8 +1388,8 @@ static int ahash_finup_no_ctx(struct ahash_request *req) static int ahash_update_first(struct ahash_request *req) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct device *jrdev = ctx->jrdev; u8 *buf = state->buf; int *buflen = &state->buflen; @@ -1498,7 +1498,7 @@ static int ahash_finup_first(struct ahash_request *req) static int ahash_init(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); state->update = ahash_update_first; state->finup = ahash_finup_first; @@ -1515,28 +1515,28 @@ static int ahash_init(struct ahash_request *req) static int ahash_update(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->update(req); } static int ahash_finup(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->finup(req); } static int ahash_final(struct ahash_request *req) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); return state->final(req); } static int ahash_export(struct ahash_request *req, void *out) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); struct caam_export_state *export = out; u8 *buf = state->buf; int len = state->buflen; @@ -1553,7 +1553,7 @@ static int ahash_export(struct ahash_request *req, void *out) static int ahash_import(struct ahash_request *req, const void *in) { - struct caam_hash_state *state = ahash_request_ctx(req); + struct caam_hash_state *state = ahash_request_ctx_dma(req); const struct caam_export_state *export = in; memset(state, 0, sizeof(*state)); @@ -1762,7 +1762,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) container_of(halg, struct ahash_alg, halg); struct caam_hash_alg *caam_hash = container_of(alg, struct caam_hash_alg, ahash_alg); - struct caam_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct caam_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); /* Sizes for MDHA running digests: MD5, SHA1, 224, 256, 384, 512 */ static const u8 runninglen[] = { HASH_MSG_LEN + MD5_DIGEST_SIZE, HASH_MSG_LEN + SHA1_DIGEST_SIZE, @@ -1854,8 +1854,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) ctx->enginectx.op.do_one_request = ahash_do_one_req; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct caam_hash_state)); + crypto_ahash_set_reqsize_dma(ahash, sizeof(struct caam_hash_state)); /* * For keyed hash algorithms shared descriptors @@ -1866,7 +1865,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm) static void caam_hash_cra_exit(struct crypto_tfm *tfm) { - struct caam_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct caam_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); dma_unmap_single_attrs(ctx->jrdev, ctx->sh_desc_update_dma, offsetof(struct caam_hash_ctx, key) - @@ -1926,7 +1925,7 @@ caam_hash_alloc(struct caam_hash_template *template, alg->cra_module = THIS_MODULE; alg->cra_init = caam_hash_cra_init; alg->cra_exit = caam_hash_cra_exit; - alg->cra_ctxsize = sizeof(struct caam_hash_ctx); + alg->cra_ctxsize = sizeof(struct caam_hash_ctx) + crypto_dma_padding(); alg->cra_priority = CAAM_CRA_PRIORITY; alg->cra_blocksize = template->blocksize; alg->cra_alignmask = 0; diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index 642846693d7c0..aef031946f337 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -57,7 +57,7 @@ static void rsa_pub_unmap(struct device *dev, struct rsa_edesc *edesc, struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct rsa_pub_pdb *pdb = &edesc->pdb.pub; @@ -69,7 +69,7 @@ static void rsa_priv_f1_unmap(struct device *dev, struct rsa_edesc *edesc, struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct rsa_priv_f1_pdb *pdb = &edesc->pdb.priv_f1; @@ -81,7 +81,7 @@ static void rsa_priv_f2_unmap(struct device *dev, struct rsa_edesc *edesc, struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct rsa_priv_f2_pdb *pdb = &edesc->pdb.priv_f2; size_t p_sz = key->p_sz; @@ -98,7 +98,7 @@ static void rsa_priv_f3_unmap(struct device *dev, struct rsa_edesc *edesc, struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct rsa_priv_f3_pdb *pdb = &edesc->pdb.priv_f3; size_t p_sz = key->p_sz; @@ -149,7 +149,7 @@ static void rsa_priv_f_done(struct device *dev, u32 *desc, u32 err, struct akcipher_request *req = context; struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct caam_drv_private_jr *jrp = dev_get_drvdata(dev); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct caam_rsa_req_ctx *req_ctx = akcipher_request_ctx(req); struct rsa_edesc *edesc; @@ -242,7 +242,7 @@ static struct rsa_edesc *rsa_edesc_alloc(struct akcipher_request *req, size_t desclen) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct device *dev = ctx->dev; struct caam_rsa_req_ctx *req_ctx = akcipher_request_ctx(req); struct caam_rsa_key *key = &ctx->key; @@ -371,7 +371,7 @@ static int akcipher_do_one_req(struct crypto_engine *engine, void *areq) base); struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct caam_rsa_req_ctx *req_ctx = akcipher_request_ctx(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct device *jrdev = ctx->dev; u32 *desc = req_ctx->edesc->hw_desc; int ret; @@ -399,7 +399,7 @@ static int set_rsa_pub_pdb(struct akcipher_request *req, { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct caam_rsa_req_ctx *req_ctx = akcipher_request_ctx(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct device *dev = ctx->dev; struct rsa_pub_pdb *pdb = &edesc->pdb.pub; @@ -444,7 +444,7 @@ static int set_rsa_priv_f1_pdb(struct akcipher_request *req, struct rsa_edesc *edesc) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct device *dev = ctx->dev; struct rsa_priv_f1_pdb *pdb = &edesc->pdb.priv_f1; @@ -491,7 +491,7 @@ static int set_rsa_priv_f2_pdb(struct akcipher_request *req, struct rsa_edesc *edesc) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct device *dev = ctx->dev; struct rsa_priv_f2_pdb *pdb = &edesc->pdb.priv_f2; @@ -568,7 +568,7 @@ static int set_rsa_priv_f3_pdb(struct akcipher_request *req, struct rsa_edesc *edesc) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct device *dev = ctx->dev; struct rsa_priv_f3_pdb *pdb = &edesc->pdb.priv_f3; @@ -664,7 +664,7 @@ static int akcipher_enqueue_req(struct device *jrdev, { struct caam_drv_private_jr *jrpriv = dev_get_drvdata(jrdev); struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct caam_rsa_req_ctx *req_ctx = akcipher_request_ctx(req); struct rsa_edesc *edesc = req_ctx->edesc; @@ -707,7 +707,7 @@ static int akcipher_enqueue_req(struct device *jrdev, static int caam_rsa_enc(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; struct device *jrdev = ctx->dev; struct rsa_edesc *edesc; @@ -746,7 +746,7 @@ init_fail: static int caam_rsa_dec_priv_f1(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct device *jrdev = ctx->dev; struct rsa_edesc *edesc; int ret; @@ -775,7 +775,7 @@ init_fail: static int caam_rsa_dec_priv_f2(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct device *jrdev = ctx->dev; struct rsa_edesc *edesc; int ret; @@ -804,7 +804,7 @@ init_fail: static int caam_rsa_dec_priv_f3(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct device *jrdev = ctx->dev; struct rsa_edesc *edesc; int ret; @@ -833,7 +833,7 @@ init_fail: static int caam_rsa_dec(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; int ret; @@ -936,7 +936,7 @@ static int caam_rsa_check_key_length(unsigned int len) static int caam_rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct rsa_key raw_key = {NULL}; struct caam_rsa_key *rsa_key = &ctx->key; int ret; @@ -1038,7 +1038,7 @@ free_p: static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct rsa_key raw_key = {NULL}; struct caam_rsa_key *rsa_key = &ctx->key; int ret; @@ -1089,7 +1089,7 @@ err: static unsigned int caam_rsa_max_size(struct crypto_akcipher *tfm) { - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); return ctx->key.n_sz; } @@ -1097,7 +1097,7 @@ static unsigned int caam_rsa_max_size(struct crypto_akcipher *tfm) /* Per session pkc's driver context creation function */ static int caam_rsa_init_tfm(struct crypto_akcipher *tfm) { - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); akcipher_set_reqsize(tfm, sizeof(struct caam_rsa_req_ctx)); @@ -1125,7 +1125,7 @@ static int caam_rsa_init_tfm(struct crypto_akcipher *tfm) /* Per session pkc's driver context cleanup function */ static void caam_rsa_exit_tfm(struct crypto_akcipher *tfm) { - struct caam_rsa_ctx *ctx = akcipher_tfm_ctx(tfm); + struct caam_rsa_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct caam_rsa_key *key = &ctx->key; dma_unmap_single(ctx->dev, ctx->padding_dma, CAAM_RSA_MAX_INPUT_SIZE - @@ -1148,7 +1148,8 @@ static struct caam_akcipher_alg caam_rsa = { .cra_driver_name = "rsa-caam", .cra_priority = 3000, .cra_module = THIS_MODULE, - .cra_ctxsize = sizeof(struct caam_rsa_ctx), + .cra_ctxsize = sizeof(struct caam_rsa_ctx) + + CRYPTO_DMA_PADDING, }, } }; -- GitLab From 2ae6feb1a1f6678fe11864f1b6920ed10b09ad6a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Fri, 25 Nov 2022 20:18:11 +0800 Subject: [PATCH 1080/1423] crypto: ccree,hisilicon - Fix dependencies to correct algorithm Commit d2825fa9365d ("crypto: sm3,sm4 - move into crypto directory") moves the SM3 and SM4 stand-alone library and the algorithm implementation for the Crypto API into the same directory, and the corresponding relationship of Kconfig is modified, CONFIG_CRYPTO_SM3/4 corresponds to the stand-alone library of SM3/4, and CONFIG_CRYPTO_SM3/4_GENERIC corresponds to the algorithm implementation for the Crypto API. Therefore, it is necessary for this module to depend on the correct algorithm. Fixes: d2825fa9365d ("crypto: sm3,sm4 - move into crypto directory") Cc: Jason A. Donenfeld Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 4 ++-- drivers/crypto/hisilicon/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 2947888d3b828..dfb103f81a64b 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -800,8 +800,8 @@ config CRYPTO_DEV_CCREE select CRYPTO_ECB select CRYPTO_CTR select CRYPTO_XTS - select CRYPTO_SM4 - select CRYPTO_SM3 + select CRYPTO_SM4_GENERIC + select CRYPTO_SM3_GENERIC help Say 'Y' to enable a driver for the REE interface of the Arm TrustZone CryptoCell family of processors. Currently the diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index 27e1fa9120639..743ce4fc3158c 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -26,7 +26,7 @@ config CRYPTO_DEV_HISI_SEC2 select CRYPTO_SHA1 select CRYPTO_SHA256 select CRYPTO_SHA512 - select CRYPTO_SM4 + select CRYPTO_SM4_GENERIC depends on PCI && PCI_MSI depends on UACCE || UACCE=n depends on ARM64 || (COMPILE_TEST && 64BIT) -- GitLab From 4dc334cab1c34efb17fa6cd10b12fbc9458e5760 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 2 Dec 2022 05:54:01 -0800 Subject: [PATCH 1081/1423] i915/gvt: Move gvt mapping cache initialization to intel_vgpu_init_dev() vfio container registers .dma_unmap() callback after the device is opened. So it's fine for mdev drivers to initialize internal mapping cache in .open_device(). See vfio_device_container_register(). Now with iommufd an access ops with an unmap callback is registered when the device is bound to iommufd which is before .open_device() is called. This implies gvt's .dma_unmap() could be called before its internal mapping cache is initialized. The fix is moving gvt mapping cache initialization to vGPU init. While at it also move ptable initialization together. Link: https://lore.kernel.org/r/20221202135402.756470-2-yi.l.liu@intel.com Reviewed-by: Zhi Wang Reviewed-by: Zhenyu Wang Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/i915/gvt/kvmgt.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7a45e5360caf2..aaf0d9e8da951 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -671,9 +671,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) vgpu->attached = true; - kvmgt_protect_table_init(vgpu); - gvt_cache_init(vgpu); - vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; kvm_page_track_register_notifier(vgpu->vfio_device.kvm, @@ -718,6 +715,11 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev) kvmgt_protect_table_destroy(vgpu); gvt_cache_destroy(vgpu); + WARN_ON(vgpu->nr_cache_entries); + + vgpu->gfn_cache = RB_ROOT; + vgpu->dma_addr_cache = RB_ROOT; + intel_vgpu_release_msi_eventfd_ctx(vgpu); vgpu->attached = false; @@ -1451,9 +1453,17 @@ static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); struct intel_vgpu_type *type = container_of(mdev->type, struct intel_vgpu_type, type); + int ret; vgpu->gvt = kdev_to_i915(mdev->type->parent->dev)->gvt; - return intel_gvt_create_vgpu(vgpu, type->conf); + ret = intel_gvt_create_vgpu(vgpu, type->conf); + if (ret) + return ret; + + kvmgt_protect_table_init(vgpu); + gvt_cache_init(vgpu); + + return 0; } static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) -- GitLab From 2a54e347d990574ceb047b71ea0b03979232b85e Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Fri, 2 Dec 2022 05:54:02 -0800 Subject: [PATCH 1082/1423] vfio/ap: Validate iova during dma_unmap and trigger irq disable Currently, each mapped iova is stashed in its associated vfio_ap_queue; when we get an unmap request, validate that it matches with one or more of these stashed values before attempting unpins. Each stashed iova represents IRQ that was enabled for a queue. Therefore, if a match is found, trigger IRQ disable for this queue to ensure that underlying firmware will no longer try to use the associated pfn after the page is unpinned. IRQ disable will also handle the associated unpin. Link: https://lore.kernel.org/r/20221202135402.756470-3-yi.l.liu@intel.com Reviewed-by: Tony Krowiak Signed-off-by: Matthew Rosato Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/s390/crypto/vfio_ap_ops.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae6..8bf353d46820e 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1535,13 +1535,29 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev, return 0; } +static void unmap_iova(struct ap_matrix_mdev *matrix_mdev, u64 iova, u64 length) +{ + struct ap_queue_table *qtable = &matrix_mdev->qtable; + struct vfio_ap_queue *q; + int loop_cursor; + + hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { + if (q->saved_iova >= iova && q->saved_iova < iova + length) + vfio_ap_irq_disable(q); + } +} + static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) { struct ap_matrix_mdev *matrix_mdev = container_of(vdev, struct ap_matrix_mdev, vdev); - vfio_unpin_pages(&matrix_mdev->vdev, iova, 1); + mutex_lock(&matrix_dev->mdevs_lock); + + unmap_iova(matrix_mdev, iova, length); + + mutex_unlock(&matrix_dev->mdevs_lock); } /** -- GitLab From 294aaccb50130f596943be892c5d3a3568b76c57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:46 -0400 Subject: [PATCH 1083/1423] vfio: Move vfio_device driver open/close code to a function This error unwind is getting complicated. Move all the code into two pair'd function. The functions should be called when the open_count == 1 after incrementing/before decrementing. Link: https://lore.kernel.org/r/1-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Yi Liu Tested-by: Nicolin Chen Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 95 ++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2d168793d4e1c..2e8346d13c16c 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -734,6 +734,51 @@ bool vfio_assert_device_open(struct vfio_device *device) return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } +static int vfio_device_first_open(struct vfio_device *device) +{ + int ret; + + lockdep_assert_held(&device->dev_set->lock); + + if (!try_module_get(device->dev->driver->owner)) + return -ENODEV; + + /* + * Here we pass the KVM pointer with the group under the lock. If the + * device driver will use it, it must obtain a reference and release it + * during close_device. + */ + mutex_lock(&device->group->group_lock); + device->kvm = device->group->kvm; + if (device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_module_put; + } + vfio_device_container_register(device); + mutex_unlock(&device->group->group_lock); + return 0; + +err_module_put: + device->kvm = NULL; + mutex_unlock(&device->group->group_lock); + module_put(device->dev->driver->owner); + return ret; +} + +static void vfio_device_last_close(struct vfio_device *device) +{ + lockdep_assert_held(&device->dev_set->lock); + + mutex_lock(&device->group->group_lock); + vfio_device_container_unregister(device); + if (device->ops->close_device) + device->ops->close_device(device); + device->kvm = NULL; + mutex_unlock(&device->group->group_lock); + module_put(device->dev->driver->owner); +} + static struct file *vfio_device_open(struct vfio_device *device) { struct file *filep; @@ -745,29 +790,12 @@ static struct file *vfio_device_open(struct vfio_device *device) if (ret) return ERR_PTR(ret); - if (!try_module_get(device->dev->driver->owner)) { - ret = -ENODEV; - goto err_unassign_container; - } - mutex_lock(&device->dev_set->lock); device->open_count++; if (device->open_count == 1) { - /* - * Here we pass the KVM pointer with the group under the read - * lock. If the device driver will use it, it must obtain a - * reference and release it during close_device. - */ - mutex_lock(&device->group->group_lock); - device->kvm = device->group->kvm; - - if (device->ops->open_device) { - ret = device->ops->open_device(device); - if (ret) - goto err_undo_count; - } - vfio_device_container_register(device); - mutex_unlock(&device->group->group_lock); + ret = vfio_device_first_open(device); + if (ret) + goto err_unassign_container; } mutex_unlock(&device->dev_set->lock); @@ -800,20 +828,11 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) { - device->ops->close_device(device); - - vfio_device_container_unregister(device); - } -err_undo_count: - mutex_unlock(&device->group->group_lock); + if (device->open_count == 1) + vfio_device_last_close(device); +err_unassign_container: device->open_count--; - if (device->open_count == 0 && device->kvm) - device->kvm = NULL; mutex_unlock(&device->dev_set->lock); - module_put(device->dev->driver->owner); -err_unassign_container: vfio_device_unassign_container(device); return ERR_PTR(ret); } @@ -1016,19 +1035,11 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) - device->ops->close_device(device); - - vfio_device_container_unregister(device); - mutex_unlock(&device->group->group_lock); + if (device->open_count == 1) + vfio_device_last_close(device); device->open_count--; - if (device->open_count == 0) - device->kvm = NULL; mutex_unlock(&device->dev_set->lock); - module_put(device->dev->driver->owner); - vfio_device_unassign_container(device); vfio_device_put_registration(device); -- GitLab From bab6fabc01d99c7e0293807e835231740379b692 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:47 -0400 Subject: [PATCH 1084/1423] vfio: Move vfio_device_assign_container() into vfio_device_first_open() The only thing this function does is assert the group has an assigned container and incrs refcounts. The overall model we have is that once a container_users refcount is incremented it cannot be de-assigned from the group - vfio_group_ioctl_unset_container() will fail and the group FD cannot be closed. Thus we do not need to check this on every device FD open, just the first. Reorganize the code so that only the first open and last close manages the container. Link: https://lore.kernel.org/r/2-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/container.c | 4 ++-- drivers/vfio/vfio_main.c | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index d74164abbf401..dd79a66ec62ca 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -531,11 +531,11 @@ int vfio_device_assign_container(struct vfio_device *device) void vfio_device_unassign_container(struct vfio_device *device) { - mutex_lock(&device->group->group_lock); + lockdep_assert_held_write(&device->group->group_lock); + WARN_ON(device->group->container_users <= 1); device->group->container_users--; fput(device->group->opened_file); - mutex_unlock(&device->group->group_lock); } /* diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2e8346d13c16c..717c7f404feee 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -749,18 +749,24 @@ static int vfio_device_first_open(struct vfio_device *device) * during close_device. */ mutex_lock(&device->group->group_lock); + ret = vfio_device_assign_container(device); + if (ret) + goto err_module_put; + device->kvm = device->group->kvm; if (device->ops->open_device) { ret = device->ops->open_device(device); if (ret) - goto err_module_put; + goto err_container; } vfio_device_container_register(device); mutex_unlock(&device->group->group_lock); return 0; -err_module_put: +err_container: device->kvm = NULL; + vfio_device_unassign_container(device); +err_module_put: mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); return ret; @@ -775,6 +781,7 @@ static void vfio_device_last_close(struct vfio_device *device) if (device->ops->close_device) device->ops->close_device(device); device->kvm = NULL; + vfio_device_unassign_container(device); mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); } @@ -784,18 +791,12 @@ static struct file *vfio_device_open(struct vfio_device *device) struct file *filep; int ret; - mutex_lock(&device->group->group_lock); - ret = vfio_device_assign_container(device); - mutex_unlock(&device->group->group_lock); - if (ret) - return ERR_PTR(ret); - mutex_lock(&device->dev_set->lock); device->open_count++; if (device->open_count == 1) { ret = vfio_device_first_open(device); if (ret) - goto err_unassign_container; + goto err_unlock; } mutex_unlock(&device->dev_set->lock); @@ -830,10 +831,9 @@ err_close_device: mutex_lock(&device->dev_set->lock); if (device->open_count == 1) vfio_device_last_close(device); -err_unassign_container: +err_unlock: device->open_count--; mutex_unlock(&device->dev_set->lock); - vfio_device_unassign_container(device); return ERR_PTR(ret); } @@ -1040,8 +1040,6 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) device->open_count--; mutex_unlock(&device->dev_set->lock); - vfio_device_unassign_container(device); - vfio_device_put_registration(device); return 0; -- GitLab From 04f930c3e44bb9010bba8521f970d00d95a94eb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:48 -0400 Subject: [PATCH 1085/1423] vfio: Rename vfio_device_assign/unassign_container() These functions don't really assign anything anymore, they just increment some refcounts and do a sanity check. Call them vfio_group_[un]use_container() Link: https://lore.kernel.org/r/3-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/container.c | 14 ++++++-------- drivers/vfio/vfio.h | 4 ++-- drivers/vfio/vfio_main.c | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index dd79a66ec62ca..499777930b08f 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -511,10 +511,8 @@ void vfio_group_detach_container(struct vfio_group *group) vfio_container_put(container); } -int vfio_device_assign_container(struct vfio_device *device) +int vfio_group_use_container(struct vfio_group *group) { - struct vfio_group *group = device->group; - lockdep_assert_held(&group->group_lock); if (!group->container || !group->container->iommu_driver || @@ -529,13 +527,13 @@ int vfio_device_assign_container(struct vfio_device *device) return 0; } -void vfio_device_unassign_container(struct vfio_device *device) +void vfio_group_unuse_container(struct vfio_group *group) { - lockdep_assert_held_write(&device->group->group_lock); + lockdep_assert_held(&group->group_lock); - WARN_ON(device->group->container_users <= 1); - device->group->container_users--; - fput(device->group->opened_file); + WARN_ON(group->container_users <= 1); + group->container_users--; + fput(group->opened_file); } /* diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08c..f95f4925b83bb 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -112,8 +112,8 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); bool vfio_assert_device_open(struct vfio_device *device); struct vfio_container *vfio_container_from_file(struct file *filep); -int vfio_device_assign_container(struct vfio_device *device); -void vfio_device_unassign_container(struct vfio_device *device); +int vfio_group_use_container(struct vfio_group *group); +void vfio_group_unuse_container(struct vfio_group *group); int vfio_container_attach_group(struct vfio_container *container, struct vfio_group *group); void vfio_group_detach_container(struct vfio_group *group); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 717c7f404feee..8c2dcb481ae10 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -749,7 +749,7 @@ static int vfio_device_first_open(struct vfio_device *device) * during close_device. */ mutex_lock(&device->group->group_lock); - ret = vfio_device_assign_container(device); + ret = vfio_group_use_container(device->group); if (ret) goto err_module_put; @@ -765,7 +765,7 @@ static int vfio_device_first_open(struct vfio_device *device) err_container: device->kvm = NULL; - vfio_device_unassign_container(device); + vfio_group_unuse_container(device->group); err_module_put: mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); @@ -781,7 +781,7 @@ static void vfio_device_last_close(struct vfio_device *device) if (device->ops->close_device) device->ops->close_device(device); device->kvm = NULL; - vfio_device_unassign_container(device); + vfio_group_unuse_container(device->group); mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); } -- GitLab From 0d8227b622f3529661ad6a9702a52932e149a30d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:49 -0400 Subject: [PATCH 1086/1423] vfio: Use IOMMU_CAP_ENFORCE_CACHE_COHERENCY for vfio_file_enforced_coherent() iommufd doesn't establish the iommu_domains until after the device FD is opened, even if the container has been set. This design is part of moving away from the group centric iommu APIs. This is fine, except that the normal sequence of establishing the kvm wbinvd won't work: group = open("/dev/vfio/XX") ioctl(group, VFIO_GROUP_SET_CONTAINER) ioctl(kvm, KVM_DEV_VFIO_GROUP_ADD) ioctl(group, VFIO_GROUP_GET_DEVICE_FD) As the domains don't start existing until GET_DEVICE_FD. Further, GET_DEVICE_FD requires that KVM_DEV_VFIO_GROUP_ADD already be done as that is what sets the group->kvm and thus device->kvm for the driver to use during open. Now that we have device centric cap ops and the new IOMMU_CAP_ENFORCE_CACHE_COHERENCY we know what the iommu_domain will be capable of without having to create it. Use this to compute vfio_file_enforced_coherent() and resolve the ordering problems. VFIO always tries to upgrade domains to enforce cache coherency, it never attaches a device that supports enforce cache coherency to a less capable domain, so the cap test is a sufficient proxy for the ultimate outcome. iommufd also ensures that devices that set the cap will be connected to enforcing domains. Link: https://lore.kernel.org/r/4-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/container.c | 5 +++-- drivers/vfio/vfio.h | 2 -- drivers/vfio/vfio_main.c | 29 ++++++++++++++++------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 499777930b08f..d97747dfb05d0 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -188,8 +188,9 @@ void vfio_device_container_unregister(struct vfio_device *device) device->group->container->iommu_data, device); } -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) +static long +vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) { struct vfio_iommu_driver *driver; long ret = 0; diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index f95f4925b83bb..7315612587042 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -119,8 +119,6 @@ int vfio_container_attach_group(struct vfio_container *container, void vfio_group_detach_container(struct vfio_group *group); void vfio_device_container_register(struct vfio_device *device); void vfio_device_container_unregister(struct vfio_device *device); -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg); int __init vfio_container_init(void); void vfio_container_cleanup(void); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 8c2dcb481ae10..77d6c0ba6a830 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1622,24 +1622,27 @@ EXPORT_SYMBOL_GPL(vfio_file_is_group); bool vfio_file_enforced_coherent(struct file *file) { struct vfio_group *group = file->private_data; - bool ret; + struct vfio_device *device; + bool ret = true; if (!vfio_file_is_group(file)) return true; - mutex_lock(&group->group_lock); - if (group->container) { - ret = vfio_container_ioctl_check_extension(group->container, - VFIO_DMA_CC_IOMMU); - } else { - /* - * Since the coherency state is determined only once a container - * is attached the user must do so before they can prove they - * have permission. - */ - ret = true; + /* + * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then + * any domain later attached to it will also not support it. If the cap + * is set then the iommu_domain eventually attached to the device/group + * must use a domain with enforce_cache_coherency(). + */ + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (!device_iommu_capable(device->dev, + IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { + ret = false; + break; + } } - mutex_unlock(&group->group_lock); + mutex_unlock(&group->device_lock); return ret; } EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); -- GitLab From 2a3dab19a0a6c1823645764188776f271de1b3cf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:50 -0400 Subject: [PATCH 1087/1423] vfio-iommufd: Allow iommufd to be used in place of a container fd This makes VFIO_GROUP_SET_CONTAINER accept both a vfio container FD and an iommufd. In iommufd mode an IOAS will exist after the SET_CONTAINER, but it will not be attached to any groups. For VFIO this means that the VFIO_GROUP_GET_STATUS and VFIO_GROUP_FLAGS_VIABLE works subtly differently. With the container FD the iommu_group_claim_dma_owner() is done during SET_CONTAINER but for IOMMUFD this is done during VFIO_GROUP_GET_DEVICE_FD. Meaning that VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due to viability. As GET_DEVICE_FD can fail for many reasons already this is not expected to be a meaningful difference. Reorganize the tests for if the group has an assigned container or iommu into a vfio_group_has_iommu() function and consolidate all the duplicated WARN_ON's etc related to this. Call container functions only if a container is actually present on the group. Link: https://lore.kernel.org/r/5-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/Kconfig | 1 + drivers/vfio/container.c | 7 +++- drivers/vfio/vfio.h | 2 + drivers/vfio/vfio_main.c | 88 +++++++++++++++++++++++++++++++++------- 4 files changed, 82 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1e..1118d322eec97 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -2,6 +2,7 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API + depends on IOMMUFD || !IOMMUFD select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) select INTERVAL_TREE help diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index d97747dfb05d0..8772dad680853 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -516,8 +516,11 @@ int vfio_group_use_container(struct vfio_group *group) { lockdep_assert_held(&group->group_lock); - if (!group->container || !group->container->iommu_driver || - WARN_ON(!group->container_users)) + /* + * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but + * VFIO_SET_IOMMU hasn't been done yet. + */ + if (!group->container->iommu_driver) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 7315612587042..a9dd0615266cb 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -10,6 +10,7 @@ #include #include +struct iommufd_ctx; struct iommu_group; struct vfio_device; struct vfio_container; @@ -60,6 +61,7 @@ struct vfio_group { struct kvm *kvm; struct file *opened_file; struct blocking_notifier_head notifier; + struct iommufd_ctx *iommufd; }; /* events for the backend driver notify callback */ diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 77d6c0ba6a830..f11157d056e6c 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -662,6 +663,18 @@ EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); /* * VFIO Group fd, /dev/vfio/$GROUP */ +static bool vfio_group_has_iommu(struct vfio_group *group) +{ + lockdep_assert_held(&group->group_lock); + /* + * There can only be users if there is a container, and if there is a + * container there must be users. + */ + WARN_ON(!group->container != !group->container_users); + + return group->container || group->iommufd; +} + /* * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or * if there was no container to unset. Since the ioctl is called on @@ -673,15 +686,21 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) int ret = 0; mutex_lock(&group->group_lock); - if (!group->container) { + if (!vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } - if (group->container_users != 1) { - ret = -EBUSY; - goto out_unlock; + if (group->container) { + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } + vfio_group_detach_container(group); + } + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; } - vfio_group_detach_container(group); out_unlock: mutex_unlock(&group->group_lock); @@ -692,6 +711,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, int __user *arg) { struct vfio_container *container; + struct iommufd_ctx *iommufd; struct fd f; int ret; int fd; @@ -704,7 +724,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, return -EBADF; mutex_lock(&group->group_lock); - if (group->container || WARN_ON(group->container_users)) { + if (vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } @@ -714,12 +734,28 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, } container = vfio_container_from_file(f.file); - ret = -EINVAL; if (container) { ret = vfio_container_attach_group(container, group); goto out_unlock; } + iommufd = iommufd_ctx_from_file(f.file); + if (!IS_ERR(iommufd)) { + u32 ioas_id; + + ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); + if (ret) { + iommufd_ctx_put(group->iommufd); + goto out_unlock; + } + + group->iommufd = iommufd; + goto out_unlock; + } + + /* The FD passed is not recognized. */ + ret = -EBADFD; + out_unlock: mutex_unlock(&group->group_lock); fdput(f); @@ -749,9 +785,16 @@ static int vfio_device_first_open(struct vfio_device *device) * during close_device. */ mutex_lock(&device->group->group_lock); - ret = vfio_group_use_container(device->group); - if (ret) + if (!vfio_group_has_iommu(device->group)) { + ret = -EINVAL; goto err_module_put; + } + + if (device->group->container) { + ret = vfio_group_use_container(device->group); + if (ret) + goto err_module_put; + } device->kvm = device->group->kvm; if (device->ops->open_device) { @@ -759,13 +802,15 @@ static int vfio_device_first_open(struct vfio_device *device) if (ret) goto err_container; } - vfio_device_container_register(device); + if (device->group->container) + vfio_device_container_register(device); mutex_unlock(&device->group->group_lock); return 0; err_container: device->kvm = NULL; - vfio_group_unuse_container(device->group); + if (device->group->container) + vfio_group_unuse_container(device->group); err_module_put: mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); @@ -777,11 +822,13 @@ static void vfio_device_last_close(struct vfio_device *device) lockdep_assert_held(&device->dev_set->lock); mutex_lock(&device->group->group_lock); - vfio_device_container_unregister(device); + if (device->group->container) + vfio_device_container_unregister(device); if (device->ops->close_device) device->ops->close_device(device); device->kvm = NULL; - vfio_group_unuse_container(device->group); + if (device->group->container) + vfio_group_unuse_container(device->group); mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); } @@ -897,7 +944,14 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, return -ENODEV; } - if (group->container) + /* + * With the container FD the iommu_group_claim_dma_owner() is done + * during SET_CONTAINER but for IOMMFD this is done during + * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd + * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due + * to viability. + */ + if (vfio_group_has_iommu(group)) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) @@ -980,6 +1034,10 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) WARN_ON(group->notifier.head); if (group->container) vfio_group_detach_container(group); + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } group->opened_file = NULL; mutex_unlock(&group->group_lock); return 0; @@ -1878,6 +1936,8 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); +MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS(IOMMUFD_VFIO); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); -- GitLab From a4d1f91db5021c57e14721ac090616c90386ac70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:51 -0400 Subject: [PATCH 1088/1423] vfio-iommufd: Support iommufd for physical VFIO devices This creates the iommufd_device for the physical VFIO drivers. These are all the drivers that are calling vfio_register_group_dev() and expect the type1 code to setup a real iommu_domain against their parent struct device. The design gives the driver a choice in how it gets connected to iommufd by providing bind_iommufd/unbind_iommufd/attach_ioas callbacks to implement as required. The core code provides three default callbacks for physical mode using a real iommu_domain. This is suitable for drivers using vfio_register_group_dev() Link: https://lore.kernel.org/r/6-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/Makefile | 1 + drivers/vfio/fsl-mc/vfio_fsl_mc.c | 3 + drivers/vfio/iommufd.c | 100 ++++++++++++++++++ .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 6 ++ drivers/vfio/pci/mlx5/main.c | 3 + drivers/vfio/pci/vfio_pci.c | 3 + drivers/vfio/platform/vfio_amba.c | 3 + drivers/vfio/platform/vfio_platform.c | 3 + drivers/vfio/vfio.h | 15 +++ drivers/vfio/vfio_main.c | 15 ++- include/linux/vfio.h | 25 +++++ 11 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/iommufd.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286f..3863922529ef2 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ container.o +vfio-$(CONFIG_IOMMUFD) += iommufd.o obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4f..5cd4bb4764403 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -592,6 +592,9 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static struct fsl_mc_driver vfio_fsl_mc_driver = { diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c new file mode 100644 index 0000000000000..6e47a3df1a710 --- /dev/null +++ b/drivers/vfio/iommufd.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include +#include + +#include "vfio.h" + +MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS(IOMMUFD_VFIO); + +int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx) +{ + u32 ioas_id; + u32 device_id; + int ret; + + lockdep_assert_held(&vdev->dev_set->lock); + + /* + * If the driver doesn't provide this op then it means the device does + * not do DMA at all. So nothing to do. + */ + if (!vdev->ops->bind_iommufd) + return 0; + + ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); + if (ret) + return ret; + + ret = iommufd_vfio_compat_ioas_id(ictx, &ioas_id); + if (ret) + goto err_unbind; + ret = vdev->ops->attach_ioas(vdev, &ioas_id); + if (ret) + goto err_unbind; + + /* + * The legacy path has no way to return the device id or the selected + * pt_id + */ + return 0; + +err_unbind: + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); + return ret; +} + +void vfio_iommufd_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); +} + +/* + * The physical standard ops mean that the iommufd_device is bound to the + * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers + * using this ops set should call vfio_register_group_dev() + */ +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + struct iommufd_device *idev; + + idev = iommufd_device_bind(ictx, vdev->dev, out_device_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + vdev->iommufd_device = idev; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); + +void vfio_iommufd_physical_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_attached) { + iommufd_device_detach(vdev->iommufd_device); + vdev->iommufd_attached = false; + } + iommufd_device_unbind(vdev->iommufd_device); + vdev->iommufd_device = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_unbind); + +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + int rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + if (rc) + return rc; + vdev->iommufd_attached = true; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7c..40019b11c5a96 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1246,6 +1246,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { @@ -1261,6 +1264,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a24..32d1f38d351e7 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -623,6 +623,9 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1d4919edfbde4..29091ee2e9849 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,9 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c5..5a046098d0bdf 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -117,6 +117,9 @@ static const struct vfio_device_ops vfio_amba_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct amba_id pl330_ids[] = { diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd902..b87c3b7087834 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -106,6 +106,9 @@ static const struct vfio_device_ops vfio_platform_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index a9dd0615266cb..9766f70a12c51 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -124,6 +124,21 @@ void vfio_device_container_unregister(struct vfio_device *device); int __init vfio_container_init(void); void vfio_container_cleanup(void); +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx); +void vfio_iommufd_unbind(struct vfio_device *device); +#else +static inline int vfio_iommufd_bind(struct vfio_device *device, + struct iommufd_ctx *ictx) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_iommufd_unbind(struct vfio_device *device) +{ +} +#endif + #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; #else diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f11157d056e6c..a74c34232c034 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -525,6 +525,11 @@ static int __vfio_register_dev(struct vfio_device *device, if (IS_ERR(group)) return PTR_ERR(group); + if (WARN_ON(device->ops->bind_iommufd && + (!device->ops->unbind_iommufd || + !device->ops->attach_ioas))) + return -EINVAL; + /* * If the driver doesn't specify a set then the device is added to a * singleton set just for itself. @@ -794,6 +799,10 @@ static int vfio_device_first_open(struct vfio_device *device) ret = vfio_group_use_container(device->group); if (ret) goto err_module_put; + } else if (device->group->iommufd) { + ret = vfio_iommufd_bind(device, device->group->iommufd); + if (ret) + goto err_module_put; } device->kvm = device->group->kvm; @@ -811,6 +820,8 @@ err_container: device->kvm = NULL; if (device->group->container) vfio_group_unuse_container(device->group); + else if (device->group->iommufd) + vfio_iommufd_unbind(device); err_module_put: mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); @@ -829,6 +840,8 @@ static void vfio_device_last_close(struct vfio_device *device) device->kvm = NULL; if (device->group->container) vfio_group_unuse_container(device->group); + else if (device->group->iommufd) + vfio_iommufd_unbind(device); mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); } @@ -1936,8 +1949,6 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); -MODULE_IMPORT_NS(IOMMUFD); -MODULE_IMPORT_NS(IOMMUFD_VFIO); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd1..a7fc4d747dc22 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -17,6 +17,8 @@ #include struct kvm; +struct iommufd_ctx; +struct iommufd_device; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -54,6 +56,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; +#if IS_ENABLED(CONFIG_IOMMUFD) + struct iommufd_device *iommufd_device; + bool iommufd_attached; +#endif }; /** @@ -80,6 +86,10 @@ struct vfio_device_ops { char *name; int (*init)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); + int (*bind_iommufd)(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); + void (*unbind_iommufd)(struct vfio_device *vdev); + int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -96,6 +106,21 @@ struct vfio_device_ops { void __user *arg, size_t argsz); }; +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_physical_unbind(struct vfio_device *vdev); +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +#else +#define vfio_iommufd_physical_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_physical_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#endif + /** * @migration_set_state: Optional callback to change the migration state for * devices that support migration. It's mandatory for -- GitLab From 4741f2e941298ad7553b65e66624435e14793391 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:52 -0400 Subject: [PATCH 1089/1423] vfio-iommufd: Support iommufd for emulated VFIO devices Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and consist of all the mdev drivers. Like the physical drivers, support for iommufd is provided by the driver supplying the correct standard ops. Provide ops from the core that duplicate what vfio_register_emulated_iommu_dev() does. Emulated drivers are where it is more likely to see variation in the iommfd support ops. For instance IDXD will probably need to setup both a iommfd_device context linked to a PASID and an iommufd_access context to support all their mdev operations. Link: https://lore.kernel.org/r/7-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/i915/gvt/kvmgt.c | 3 + drivers/s390/cio/vfio_ccw_ops.c | 3 + drivers/s390/crypto/vfio_ap_ops.c | 3 + drivers/vfio/container.c | 110 +++++---------------------- drivers/vfio/iommufd.c | 58 ++++++++++++++ drivers/vfio/vfio.h | 10 ++- drivers/vfio/vfio_main.c | 122 +++++++++++++++++++++++++++++- include/linux/vfio.h | 14 ++++ 8 files changed, 229 insertions(+), 94 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index aaf0d9e8da951..f5164099c264e 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1484,6 +1484,9 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, .dma_unmap = intel_vgpu_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static int intel_vgpu_probe(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6ae4d012d8008..560453d99c24f 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -588,6 +588,9 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .ioctl = vfio_ccw_mdev_ioctl, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; struct mdev_driver vfio_ccw_mdev_driver = { diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 8bf353d46820e..68eeb25fb6611 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1805,6 +1805,9 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, .dma_unmap = vfio_ap_mdev_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static struct mdev_driver vfio_ap_matrix_driver = { diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 8772dad680853..7f3961fd4b5aa 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -540,113 +540,41 @@ void vfio_group_unuse_container(struct vfio_group *group) fput(group->opened_file); } -/* - * Pin contiguous user pages and return their associated host pages for local - * domain only. - * @device [in] : device - * @iova [in] : starting IOVA of user pages to be pinned. - * @npage [in] : count of pages to be pinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @pages[out] : array of host pages - * Return error or number of pages pinned. - * - * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev(). - */ -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, - int npage, int prot, struct page **pages) +int vfio_container_pin_pages(struct vfio_container *container, + struct iommu_group *iommu_group, dma_addr_t iova, + int npage, int prot, struct page **pages) { - struct vfio_container *container; - struct vfio_group *group = device->group; - struct vfio_iommu_driver *driver; - int ret; - - if (!pages || !npage || !vfio_assert_device_open(device)) - return -EINVAL; + struct vfio_iommu_driver *driver = container->iommu_driver; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - /* group->container cannot change while a vfio device is open */ - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, iova, - npage, prot, pages); - else - ret = -ENOTTY; - - return ret; + if (unlikely(!driver || !driver->ops->pin_pages)) + return -ENOTTY; + return driver->ops->pin_pages(container->iommu_data, iommu_group, iova, + npage, prot, pages); } -EXPORT_SYMBOL(vfio_pin_pages); -/* - * Unpin contiguous host pages for local domain only. - * @device [in] : device - * @iova [in] : starting address of user pages to be unpinned. - * @npage [in] : count of pages to be unpinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - */ -void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +void vfio_container_unpin_pages(struct vfio_container *container, + dma_addr_t iova, int npage) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; - if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) return; - if (WARN_ON(!vfio_assert_device_open(device))) - return; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - driver->ops->unpin_pages(container->iommu_data, iova, npage); + container->iommu_driver->ops->unpin_pages(container->iommu_data, iova, + npage); } -EXPORT_SYMBOL(vfio_unpin_pages); -/* - * This interface allows the CPUs to perform some sort of virtual DMA on - * behalf of the device. - * - * CPUs read/write from/into a range of IOVAs pointing to user space memory - * into/from a kernel buffer. - * - * As the read/write of user space memory is conducted via the CPUs and is - * not a real device DMA, it is not necessary to pin the user space memory. - * - * @device [in] : VFIO device - * @iova [in] : base IOVA of a user space buffer - * @data [in] : pointer to kernel buffer - * @len [in] : kernel buffer length - * @write : indicate read or write - * Return error code on failure or 0 on success. - */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, - size_t len, bool write) +int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, + void *data, size_t len, bool write) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret = 0; - - if (!data || len <= 0 || !vfio_assert_device_open(device)) - return -EINVAL; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; + struct vfio_iommu_driver *driver = container->iommu_driver; - if (likely(driver && driver->ops->dma_rw)) - ret = driver->ops->dma_rw(container->iommu_data, - iova, data, len, write); - else - ret = -ENOTTY; - return ret; + if (unlikely(!driver || !driver->ops->dma_rw)) + return -ENOTTY; + return driver->ops->dma_rw(container->iommu_data, iova, data, len, + write); } -EXPORT_SYMBOL(vfio_dma_rw); int __init vfio_container_init(void) { diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 6e47a3df1a710..4f82a6fa7c6c7 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -98,3 +98,61 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return 0; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); + +/* + * The emulated standard ops mean that vfio_device is going to use the + * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this + * ops set should call vfio_register_emulated_iommu_dev(). + */ + +static void vfio_emulated_unmap(void *data, unsigned long iova, + unsigned long length) +{ + struct vfio_device *vdev = data; + + vdev->ops->dma_unmap(vdev, iova, length); +} + +static const struct iommufd_access_ops vfio_user_ops = { + .needs_pin_pages = 1, + .unmap = vfio_emulated_unmap, +}; + +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + vdev->iommufd_ictx = ictx; + iommufd_ctx_get(ictx); + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind); + +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_access) { + iommufd_access_destroy(vdev->iommufd_access); + vdev->iommufd_access = NULL; + } + iommufd_ctx_put(vdev->iommufd_ictx); + vdev->iommufd_ictx = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind); + +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + struct iommufd_access *user; + + lockdep_assert_held(&vdev->dev_set->lock); + + user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops, + vdev); + if (IS_ERR(user)) + return PTR_ERR(user); + vdev->iommufd_access = user; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 9766f70a12c51..b1ef842496372 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -111,8 +111,6 @@ struct vfio_iommu_driver { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); -bool vfio_assert_device_open(struct vfio_device *device); - struct vfio_container *vfio_container_from_file(struct file *filep); int vfio_group_use_container(struct vfio_group *group); void vfio_group_unuse_container(struct vfio_group *group); @@ -121,6 +119,14 @@ int vfio_container_attach_group(struct vfio_container *container, void vfio_group_detach_container(struct vfio_group *group); void vfio_device_container_register(struct vfio_device *device); void vfio_device_container_unregister(struct vfio_device *device); +int vfio_container_pin_pages(struct vfio_container *container, + struct iommu_group *iommu_group, dma_addr_t iova, + int npage, int prot, struct page **pages); +void vfio_container_unpin_pages(struct vfio_container *container, + dma_addr_t iova, int npage); +int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, + void *data, size_t len, bool write); + int __init vfio_container_init(void); void vfio_container_cleanup(void); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a74c34232c034..fd5e969ab653a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -770,7 +770,7 @@ out_unlock: static const struct file_operations vfio_device_fops; /* true if the vfio_device has open_device() called but not close_device() */ -bool vfio_assert_device_open(struct vfio_device *device) +static bool vfio_assert_device_open(struct vfio_device *device) { return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } @@ -1876,6 +1876,126 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, } EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); +/* + * Pin contiguous user pages and return their associated host pages for local + * domain only. + * @device [in] : device + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @pages[out] : array of host pages + * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev() due to vfio_container_pin_pages(). + */ +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) +{ + /* group->container cannot change while a vfio device is open */ + if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) + return -EINVAL; + if (device->group->container) + return vfio_container_pin_pages(device->group->container, + device->group->iommu_group, + iova, npage, prot, pages); + if (device->iommufd_access) { + int ret; + + if (iova > ULONG_MAX) + return -EINVAL; + /* + * VFIO ignores the sub page offset, npages is from the start of + * a PAGE_SIZE chunk of IOVA. The caller is expected to recover + * the sub page offset by doing: + * pages[0] + (iova % PAGE_SIZE) + */ + ret = iommufd_access_pin_pages( + device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE, pages, + (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); + if (ret) + return ret; + return npage; + } + return -EINVAL; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin contiguous host pages for local domain only. + * @device [in] : device + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + */ +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +{ + if (WARN_ON(!vfio_assert_device_open(device))) + return; + + if (device->group->container) { + vfio_container_unpin_pages(device->group->container, iova, + npage); + return; + } + if (device->iommufd_access) { + if (WARN_ON(iova > ULONG_MAX)) + return; + iommufd_access_unpin_pages(device->iommufd_access, + ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE); + return; + } +} +EXPORT_SYMBOL(vfio_unpin_pages); + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * @device [in] : VFIO device + * @iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, + size_t len, bool write) +{ + if (!data || len <= 0 || !vfio_assert_device_open(device)) + return -EINVAL; + + if (device->group->container) + return vfio_container_dma_rw(device->group->container, iova, + data, len, write); + + if (device->iommufd_access) { + unsigned int flags = 0; + + if (iova > ULONG_MAX) + return -EINVAL; + + /* VFIO historically tries to auto-detect a kthread */ + if (!current->mm) + flags |= IOMMUFD_ACCESS_RW_KTHREAD; + if (write) + flags |= IOMMUFD_ACCESS_RW_WRITE; + return iommufd_access_rw(device->iommufd_access, iova, data, + len, flags); + } + return -EINVAL; +} +EXPORT_SYMBOL(vfio_dma_rw); + /* * Module/class support */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a7fc4d747dc22..d5f84f98c0fa8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -19,6 +19,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; +struct iommufd_access; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -56,8 +57,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; + struct iommufd_access *iommufd_access; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct iommufd_ctx *iommufd_ictx; bool iommufd_attached; #endif }; @@ -111,6 +114,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); #else #define vfio_iommufd_physical_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ @@ -119,6 +126,13 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_physical_attach_ioas \ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#define vfio_iommufd_emulated_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_emulated_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_emulated_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #endif /** -- GitLab From 81ab9890da97e07862476bf635c80adee9b1c515 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:53 -0400 Subject: [PATCH 1090/1423] vfio: Move container related MODULE_ALIAS statements into container.c The miscdev is in container.c, so should these related MODULE_ALIAS statements. This is necessary for the next patch to be able to fully disable /dev/vfio/vfio. Fixes: cdc71fe4ecbf ("vfio: Move container code into drivers/vfio/container.c") Link: https://lore.kernel.org/r/8-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Reported-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/container.c | 3 +++ drivers/vfio/vfio_main.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 7f3961fd4b5aa..6b362d97d6822 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -608,3 +608,6 @@ void vfio_container_cleanup(void) misc_deregister(&vfio_dev); mutex_destroy(&vfio.iommu_drivers_lock); } + +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index fd5e969ab653a..ce6e6a560c702 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -2073,6 +2073,4 @@ MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_ALIAS_MISCDEV(VFIO_MINOR); -MODULE_ALIAS("devname:vfio/vfio"); MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); -- GitLab From e5a9ec7e096ab7a3b34022409a6ddc63e4e83674 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:54 -0400 Subject: [PATCH 1091/1423] vfio: Make vfio_container optionally compiled Add a kconfig CONFIG_VFIO_CONTAINER that controls compiling the container code. If 'n' then only iommufd will provide the container service. All the support for vfio iommu drivers, including type1, will not be built. This allows a compilation check that no inappropriate dependencies between the device/group and container have been created. Link: https://lore.kernel.org/r/9-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/vfio/Kconfig | 35 +++++++++++++++-------- drivers/vfio/Makefile | 4 +-- drivers/vfio/vfio.h | 65 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 1118d322eec97..286c1663bd756 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,8 +3,8 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API depends on IOMMUFD || !IOMMUFD - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) select INTERVAL_TREE + select VFIO_CONTAINER if IOMMUFD=n help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. @@ -12,6 +12,18 @@ menuconfig VFIO If you don't know what to do here, say N. if VFIO +config VFIO_CONTAINER + bool "Support for the VFIO container /dev/vfio/vfio" + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + default y + help + The VFIO container is the classic interface to VFIO for establishing + IOMMU mappings. If N is selected here then IOMMUFD must be used to + manage the mappings. + + Unless testing IOMMUFD say Y here. + +if VFIO_CONTAINER config VFIO_IOMMU_TYPE1 tristate default n @@ -21,16 +33,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - -config VFIO_VIRQFD - tristate - select EVENTFD - default n - config VFIO_NOIOMMU bool "VFIO No-IOMMU support" help @@ -44,6 +46,17 @@ config VFIO_NOIOMMU this mode since there is no IOMMU to provide DMA translation. If you don't know what to do here, say N. +endif + +config VFIO_SPAPR_EEH + tristate + depends on EEH && VFIO_IOMMU_SPAPR_TCE + default VFIO + +config VFIO_VIRQFD + tristate + select EVENTFD + default n source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 3863922529ef2..b953517dc70f9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -4,9 +4,9 @@ vfio_virqfd-y := virqfd.o obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ - iova_bitmap.o \ - container.o + iova_bitmap.o vfio-$(CONFIG_IOMMUFD) += iommufd.o +vfio-$(CONFIG_VFIO_CONTAINER) += container.o obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index b1ef842496372..ce5fe3fc493b4 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -55,7 +55,9 @@ struct vfio_group { struct list_head device_list; struct mutex device_lock; struct list_head vfio_next; +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) struct list_head container_next; +#endif enum vfio_group_type type; struct mutex group_lock; struct kvm *kvm; @@ -64,6 +66,7 @@ struct vfio_group { struct iommufd_ctx *iommufd; }; +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -129,6 +132,68 @@ int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, int __init vfio_container_init(void); void vfio_container_cleanup(void); +#else +static inline struct vfio_container * +vfio_container_from_file(struct file *filep) +{ + return NULL; +} + +static inline int vfio_group_use_container(struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_unuse_container(struct vfio_group *group) +{ +} + +static inline int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_detach_container(struct vfio_group *group) +{ +} + +static inline void vfio_device_container_register(struct vfio_device *device) +{ +} + +static inline void vfio_device_container_unregister(struct vfio_device *device) +{ +} + +static inline int vfio_container_pin_pages(struct vfio_container *container, + struct iommu_group *iommu_group, + dma_addr_t iova, int npage, int prot, + struct page **pages) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_container_unpin_pages(struct vfio_container *container, + dma_addr_t iova, int npage) +{ +} + +static inline int vfio_container_dma_rw(struct vfio_container *container, + dma_addr_t iova, void *data, size_t len, + bool write) +{ + return -EOPNOTSUPP; +} + +static inline int vfio_container_init(void) +{ + return 0; +} +static inline void vfio_container_cleanup(void) +{ +} +#endif #if IS_ENABLED(CONFIG_IOMMUFD) int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx); -- GitLab From 01f70cbb26eadb5959344598977cb7159948263a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:55 -0400 Subject: [PATCH 1092/1423] iommufd: Allow iommufd to supply /dev/vfio/vfio If the VFIO container is compiled out, give a kconfig option for iommufd to provide the miscdev node with the same name and permissions as vfio uses. The compatibility node supports the same ioctls as VFIO and automatically enables the VFIO compatible pinned page accounting mode. Link: https://lore.kernel.org/r/10-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Kconfig | 20 +++++++++++++++++++ drivers/iommu/iommufd/main.c | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 871244f2443fb..8306616b6d819 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -12,6 +12,26 @@ config IOMMUFD If you don't know what to do here, say N. if IOMMUFD +config IOMMUFD_VFIO_CONTAINER + bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" + depends on VFIO && !VFIO_CONTAINER + default VFIO && !VFIO_CONTAINER + help + IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on + IOMMUFD providing compatibility emulation to give the same ioctls. + It provides an option to build a kernel with legacy VFIO components + removed. + + IOMMUFD VFIO container emulation is known to lack certain features + of the native VFIO container, such as no-IOMMU support, peer-to-peer + DMA mapping, PPC IOMMU support, as well as other potentially + undiscovered gaps. This option is currently intended for the + purpose of testing IOMMUFD with unmodified userspace supporting VFIO + and making use of the Type1 VFIO IOMMU backend. General purpose + enabling of this option is currently discouraged. + + Unless testing IOMMUFD, say N here. + config IOMMUFD_TEST bool "IOMMU Userspace API Test support" depends on DEBUG_KERNEL diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index bcb463e581009..083e6fcbe10ad 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -18,6 +18,7 @@ #include #include +#include "io_pagetable.h" #include "iommufd_private.h" #include "iommufd_test.h" @@ -25,6 +26,7 @@ struct iommufd_object_ops { void (*destroy)(struct iommufd_object *obj); }; static const struct iommufd_object_ops iommufd_object_ops[]; +static struct miscdevice vfio_misc_dev; struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -170,6 +172,16 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) if (!ictx) return -ENOMEM; + /* + * For compatibility with VFIO when /dev/vfio/vfio is opened we default + * to the same rlimit accounting as vfio uses. + */ + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && + filp->private_data == &vfio_misc_dev) { + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); + } + xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); ictx->file = filp; filp->private_data = ictx; @@ -400,6 +412,15 @@ static struct miscdevice iommu_misc_dev = { .mode = 0660, }; + +static struct miscdevice vfio_misc_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &iommufd_fops, + .nodename = "vfio/vfio", + .mode = 0666, +}; + static int __init iommufd_init(void) { int ret; @@ -407,18 +428,33 @@ static int __init iommufd_init(void) ret = misc_register(&iommu_misc_dev); if (ret) return ret; + + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { + ret = misc_register(&vfio_misc_dev); + if (ret) + goto err_misc; + } iommufd_test_init(); return 0; +err_misc: + misc_deregister(&iommu_misc_dev); + return ret; } static void __exit iommufd_exit(void) { iommufd_test_exit(); + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); misc_deregister(&iommu_misc_dev); } module_init(iommufd_init); module_exit(iommufd_exit); +#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); +#endif MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); -- GitLab From b058ea3ab5afea873ab8d976277539ca9e43869a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 29 Nov 2022 13:12:35 +0000 Subject: [PATCH 1093/1423] vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries Commit f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") had fixed the unaligned bitmaps by capping the remaining iterable set at the start of the bitmap. Although, that mistakenly worked around iova_bitmap_set() incorrectly setting bits across page boundary. Fix this by reworking the loop inside iova_bitmap_set() to iterate over a range of bits to set (cur_bit .. last_bit) which may span different pinned pages, thus updating @page_idx and @offset as it sets the bits. The previous cap to the first page is now adjusted to be always accounted rather than when there's only a non-zero pgoff. While at it, make @page_idx , @offset and @nbits to be unsigned int given that it won't be more than 512 and 4096 respectively (even a bigger PAGE_SIZE or a smaller struct page size won't make this bigger than the above 32-bit max). Also, delete the stale kdoc on Return type. Cc: Avihai Horon Fixes: f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Tested-by: Avihai Horon Link: https://lore.kernel.org/r/20221129131235.38880-1-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index de6d6ea5c4967..0848f920efb7c 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -298,9 +298,7 @@ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { unsigned long remaining, bytes; - /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ - bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : - PAGE_SIZE - bitmap->mapped.pgoff; + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, @@ -399,29 +397,27 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, * Set the bits corresponding to the range [iova .. iova+length-1] in * the user bitmap. * - * Return: The number of bits set. */ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; - unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); - unsigned long page_idx = offset / BITS_PER_PAGE; - unsigned long page_offset = mapped->pgoff; - void *kaddr; - - offset = offset % BITS_PER_PAGE; + unsigned long cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { - unsigned long size = min(BITS_PER_PAGE - offset, nbits); + unsigned int page_idx = cur_bit / BITS_PER_PAGE; + unsigned int offset = cur_bit % BITS_PER_PAGE; + unsigned int nbits = min(BITS_PER_PAGE - offset, + last_bit - cur_bit + 1); + void *kaddr; kaddr = kmap_local_page(mapped->pages[page_idx]); - bitmap_set(kaddr + page_offset, offset, size); + bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); - page_offset = offset = 0; - nbits -= size; - page_idx++; - } while (nbits > 0); + cur_bit += nbits; + } while (cur_bit <= last_bit); } EXPORT_SYMBOL_GPL(iova_bitmap_set); -- GitLab From 61e15f871241ee86f217320909005cd022dd844f Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:08 +0100 Subject: [PATCH 1094/1423] KVM: Delete all references to removed KVM_SET_MEMORY_REGION ioctl The documentation says that the ioctl has been deprecated, but it has been actually removed and the remaining references are just left overs. Suggested-by: Sean Christopherson Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-2-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ---------------- include/uapi/linux/kvm.h | 12 ------------ tools/include/uapi/linux/kvm.h | 12 ------------ 3 files changed, 40 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5617bc4f899f4..850e187c0a389 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -272,18 +272,6 @@ the VCPU file descriptor can be mmap-ed, including: KVM_CAP_DIRTY_LOG_RING, see section 8.3. -4.6 KVM_SET_MEMORY_REGION -------------------------- - -:Capability: basic -:Architectures: all -:Type: vm ioctl -:Parameters: struct kvm_memory_region (in) -:Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - - 4.7 KVM_CREATE_VCPU ------------------- @@ -1377,10 +1365,6 @@ the memory region are automatically reflected into the guest. For example, an mmap() that affects the region will be made visible immediately. Another example is madvise(MADV_DROP). -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - 4.36 KVM_SET_TSS_ADDR --------------------- diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 64dfe9c07c879..c338ca2c972d2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -86,14 +86,6 @@ struct kvm_debug_guest { /* *** End of deprecated interfaces *** */ -/* for KVM_CREATE_MEMORY_REGION */ -struct kvm_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ -}; - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -1442,10 +1434,6 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; -/* - * ioctls for VM fds - */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 0d5d4419139ae..8899201d5964f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -86,14 +86,6 @@ struct kvm_debug_guest { /* *** End of deprecated interfaces *** */ -/* for KVM_CREATE_MEMORY_REGION */ -struct kvm_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ -}; - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -1437,10 +1429,6 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; -/* - * ioctls for VM fds - */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. -- GitLab From 66a9221d73e71199a120ca12ea5bfaac2aa670a3 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:09 +0100 Subject: [PATCH 1095/1423] KVM: Delete all references to removed KVM_SET_MEMORY_ALIAS ioctl The documentation says that the ioctl has been deprecated, but it has been actually removed and the remaining references are just left overs. Suggested-by: Sean Christopherson Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-3-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 ----------- arch/x86/include/uapi/asm/kvm.h | 8 -------- include/uapi/linux/kvm.h | 2 -- tools/arch/x86/include/uapi/asm/kvm.h | 8 -------- tools/include/uapi/linux/kvm.h | 2 -- 5 files changed, 31 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 850e187c0a389..07e8a42d839a2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -356,17 +356,6 @@ see the description of the capability. Note that the Xen shared info page, if configured, shall always be assumed to be dirty. KVM will not explicitly mark it such. -4.9 KVM_SET_MEMORY_ALIAS ------------------------- - -:Capability: basic -:Architectures: x86 -:Type: vm ioctl -:Parameters: struct kvm_memory_alias (in) -:Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - 4.10 KVM_RUN ------------ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c6df6b16a0885..e48deab8901d4 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c338ca2c972d2..ce91837656169 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1440,8 +1440,6 @@ struct kvm_vfio_spapr_tce { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ -#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 46de10a809ecb..649e50a8f9ddf 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8899201d5964f..6ba2928f8f18f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1435,8 +1435,6 @@ struct kvm_vfio_spapr_tce { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ -#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ -- GitLab From 30ee198ce42d60101620d33f8bc70c3234798365 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:10 +0100 Subject: [PATCH 1096/1423] KVM: Reference to kvm_userspace_memory_region in doc and comments There are still references to the removed kvm_memory_region data structure but the doc and comments should mention struct kvm_userspace_memory_region instead, since that is what's used by the ioctl that replaced the old one and this data structure support the same set of flags. Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-4-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- include/linux/kvm_host.h | 4 ++-- include/uapi/linux/kvm.h | 6 +++--- tools/include/uapi/linux/kvm.h | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 07e8a42d839a2..92e14823619a2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1309,7 +1309,7 @@ yet and must be cleared on entry. __u64 userspace_addr; /* start of the userspace allocated memory */ }; - /* for kvm_memory_region::flags */ + /* for kvm_userspace_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f874a9643139..a5a82b5367749 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -50,8 +50,8 @@ #endif /* - * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used - * in kvm, other bits are visible for userspace which are defined in + * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally + * used in kvm, other bits are visible for userspace which are defined in * include/linux/kvm_h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ce91837656169..03708ce10bdaf 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -96,9 +96,9 @@ struct kvm_userspace_memory_region { }; /* - * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, - * other bits are reserved for kvm internal use which are defined in - * include/linux/kvm_host.h. + * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for + * userspace, other bits are reserved for kvm internal use which are defined + * in include/linux/kvm_host.h. */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6ba2928f8f18f..21d6d29502e48 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -96,9 +96,9 @@ struct kvm_userspace_memory_region { }; /* - * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, - * other bits are reserved for kvm internal use which are defined in - * include/linux/kvm_host.h. + * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for + * userspace, other bits are reserved for kvm internal use which are defined + *in include/linux/kvm_host.h. */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) -- GitLab From 10c5e80b2c4d67fa9a931ac57beab782cc3db2ef Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:11 +0100 Subject: [PATCH 1097/1423] KVM: Add missing arch for KVM_CREATE_DEVICE and KVM_{SET,GET}_DEVICE_ATTR The ioctls are missing an architecture property that is present in others. Suggested-by: Sergio Lopez Pascual Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-5-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 92e14823619a2..a63d86be45d93 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3266,6 +3266,7 @@ valid entries found. ---------------------- :Capability: KVM_CAP_DEVICE_CTRL +:Architectures: all :Type: vm ioctl :Parameters: struct kvm_create_device (in/out) :Returns: 0 on success, -1 on error @@ -3306,6 +3307,7 @@ number. :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, KVM_CAP_VCPU_ATTRIBUTES for vcpu device KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set) +:Architectures: x86, arm64, s390 :Type: device ioctl, vm ioctl, vcpu ioctl :Parameters: struct kvm_device_attr :Returns: 0 on success, -1 on error -- GitLab From dd03cc90e09daeb8a9509e65a39eb576256790b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Dec 2022 22:04:33 +0000 Subject: [PATCH 1098/1423] KVM: Remove stale comment about KVM_REQ_UNHALT Remove a comment about KVM_REQ_UNHALT being set by kvm_vcpu_check_block() that was missed when KVM_REQ_UNHALT was dropped. Fixes: c59fb1275838 ("KVM: remove KVM_REQ_UNHALT") Signed-off-by: Sean Christopherson Message-Id: <20221201220433.31366-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1782c4555d94f..1401dcba2f826 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3518,10 +3518,6 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu) ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); do { - /* - * This sets KVM_REQ_UNHALT if an interrupt - * arrives. - */ if (kvm_vcpu_check_block(vcpu) < 0) goto out; cpu_relax(); -- GitLab From 8a6841c439dfbba2067a533b0e8264ea438689f6 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 15 Nov 2022 20:08:29 +0000 Subject: [PATCH 1099/1423] RISC-V: use REG_S/REG_L for mcount In preparation for rv32i ftrace support, convert mcount routines to use native sized loads/stores. Reviewed-by: Andrew Jones Signed-off-by: Jamie Iles Link: https://lore.kernel.org/r/20221115200832.706370-2-jamie@jamieiles.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/mcount.S | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 6d462681c9c02..9cf0904afd6dd 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -15,8 +15,8 @@ .macro SAVE_ABI_STATE addi sp, sp, -16 - sd s0, 0(sp) - sd ra, 8(sp) + REG_S s0, 0(sp) + REG_S ra, 8(sp) addi s0, sp, 16 .endm @@ -26,22 +26,22 @@ */ .macro SAVE_RET_ABI_STATE addi sp, sp, -32 - sd s0, 16(sp) - sd ra, 24(sp) - sd a0, 8(sp) + REG_S s0, 16(sp) + REG_S ra, 24(sp) + REG_S a0, 8(sp) addi s0, sp, 32 .endm .macro RESTORE_ABI_STATE - ld ra, 8(sp) - ld s0, 0(sp) + REG_L ra, 8(sp) + REG_L s0, 0(sp) addi sp, sp, 16 .endm .macro RESTORE_RET_ABI_STATE - ld ra, 24(sp) - ld s0, 16(sp) - ld a0, 8(sp) + REG_L ra, 24(sp) + REG_L s0, 16(sp) + REG_L a0, 8(sp) addi sp, sp, 32 .endm @@ -82,16 +82,16 @@ ENTRY(MCOUNT_NAME) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return - ld t1, 0(t0) + REG_L t1, 0(t0) bne t1, t4, do_ftrace_graph_caller la t3, ftrace_graph_entry - ld t2, 0(t3) + REG_L t2, 0(t3) la t6, ftrace_graph_entry_stub bne t2, t6, do_ftrace_graph_caller #endif la t3, ftrace_trace_function - ld t5, 0(t3) + REG_L t5, 0(t3) bne t5, t4, do_trace ret @@ -104,7 +104,7 @@ do_ftrace_graph_caller: addi a0, s0, -8 mv a1, ra #ifdef HAVE_FUNCTION_GRAPH_FP_TEST - ld a2, -16(s0) + REG_L a2, -16(s0) #endif SAVE_ABI_STATE call prepare_ftrace_return @@ -117,7 +117,7 @@ do_ftrace_graph_caller: * (*ftrace_trace_function)(ra_to_caller, ra_to_caller_of_caller) */ do_trace: - ld a1, -8(s0) + REG_L a1, -8(s0) mv a0, ra SAVE_ABI_STATE -- GitLab From 3bd7743f8d6d7171db9897a746038eefd52a1fbd Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 15 Nov 2022 20:08:30 +0000 Subject: [PATCH 1100/1423] RISC-V: reduce mcount save space on RV32 For RV32 we can reduce the size of the ABI save+restore state by using SZREG so that register stores are packed rather than on an 8 byte boundary. Signed-off-by: Jamie Iles Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20221115200832.706370-3-jamie@jamieiles.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/mcount.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 9cf0904afd6dd..613bd07c6268a 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -15,8 +15,8 @@ .macro SAVE_ABI_STATE addi sp, sp, -16 - REG_S s0, 0(sp) - REG_S ra, 8(sp) + REG_S s0, 0*SZREG(sp) + REG_S ra, 1*SZREG(sp) addi s0, sp, 16 .endm @@ -25,24 +25,24 @@ * register if a0 was not saved. */ .macro SAVE_RET_ABI_STATE - addi sp, sp, -32 - REG_S s0, 16(sp) - REG_S ra, 24(sp) - REG_S a0, 8(sp) - addi s0, sp, 32 + addi sp, sp, -4*SZREG + REG_S s0, 2*SZREG(sp) + REG_S ra, 3*SZREG(sp) + REG_S a0, 1*SZREG(sp) + addi s0, sp, 4*SZREG .endm .macro RESTORE_ABI_STATE - REG_L ra, 8(sp) - REG_L s0, 0(sp) + REG_L ra, 1*SZREG(sp) + REG_L s0, 0*SZREG(sp) addi sp, sp, 16 .endm .macro RESTORE_RET_ABI_STATE - REG_L ra, 24(sp) - REG_L s0, 16(sp) - REG_L a0, 8(sp) - addi sp, sp, 32 + REG_L ra, 3*SZREG(sp) + REG_L s0, 2*SZREG(sp) + REG_L a0, 1*SZREG(sp) + addi sp, sp, 4*SZREG .endm ENTRY(ftrace_stub) @@ -101,10 +101,10 @@ ENTRY(MCOUNT_NAME) * prepare_to_return(&ra_to_caller_of_caller, ra_to_caller) */ do_ftrace_graph_caller: - addi a0, s0, -8 + addi a0, s0, -SZREG mv a1, ra #ifdef HAVE_FUNCTION_GRAPH_FP_TEST - REG_L a2, -16(s0) + REG_L a2, -2*SZREG(s0) #endif SAVE_ABI_STATE call prepare_ftrace_return @@ -117,7 +117,7 @@ do_ftrace_graph_caller: * (*ftrace_trace_function)(ra_to_caller, ra_to_caller_of_caller) */ do_trace: - REG_L a1, -8(s0) + REG_L a1, -SZREG(s0) mv a0, ra SAVE_ABI_STATE -- GitLab From dc58a24db8c12ea361e94eaf53adc5d471534694 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 15 Nov 2022 20:08:31 +0000 Subject: [PATCH 1101/1423] RISC-V: preserve a1 in mcount The RISC-V ELF psABI states that both a0 and a1 are used for return values so we should preserve them both in return_to_handler. This is especially important for RV32 for functions returning a 64-bit quantity otherwise the return value can be corrupted and undefined behaviour results. Reviewed-by: Andrew Jones Signed-off-by: Jamie Iles Link: https://lore.kernel.org/r/20221115200832.706370-4-jamie@jamieiles.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/mcount.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 613bd07c6268a..30102aadc4d73 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -29,6 +29,7 @@ REG_S s0, 2*SZREG(sp) REG_S ra, 3*SZREG(sp) REG_S a0, 1*SZREG(sp) + REG_S a1, 0*SZREG(sp) addi s0, sp, 4*SZREG .endm @@ -42,6 +43,7 @@ REG_L ra, 3*SZREG(sp) REG_L s0, 2*SZREG(sp) REG_L a0, 1*SZREG(sp) + REG_L a1, 0*SZREG(sp) addi sp, sp, 4*SZREG .endm @@ -71,9 +73,9 @@ ENTRY(return_to_handler) mv a0, t6 #endif call ftrace_return_to_handler - mv a1, a0 + mv a2, a0 RESTORE_RET_ABI_STATE - jalr a1 + jalr a2 ENDPROC(return_to_handler) #endif -- GitLab From f32b4b467ebd8a035f8342b7ea27efc84b10d96b Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 15 Nov 2022 20:08:32 +0000 Subject: [PATCH 1102/1423] RISC-V: enable dynamic ftrace for RV32I The RISC-V mcount function is now capable of supporting RV32I so make it available in the kernel config. Signed-off-by: Jamie Iles Link: https://lore.kernel.org/r/20221115200832.706370-5-jamie@jamieiles.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6b48a3ae98439..5ae4f7ce2a056 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -129,6 +129,11 @@ config RISCV select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU select ZONE_DMA32 if 64BIT + select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && $(cc-option,-fpatchable-function-entry=8) + select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE + select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER if !XIP_KERNEL config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT @@ -274,11 +279,6 @@ config ARCH_RV64I bool "RV64I" select 64BIT select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 - select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && $(cc-option,-fpatchable-function-entry=8) - select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL - select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER if !XIP_KERNEL select SWIOTLB if MMU endchoice -- GitLab From ef40757743b47cc95de9b4ed41525c94f8dc73d9 Mon Sep 17 00:00:00 2001 From: Yuan ZhaoXiong Date: Fri, 2 Dec 2022 20:36:14 +0800 Subject: [PATCH 1103/1423] KVM: x86: fix APICv/x2AVIC disabled when vm reboot by itself When a VM reboots itself, the reset process will result in an ioctl(KVM_SET_LAPIC, ...) to disable x2APIC mode and set the xAPIC id of the vCPU to its default value, which is the vCPU id. That will be handled in KVM as follows: kvm_vcpu_ioctl_set_lapic kvm_apic_set_state kvm_lapic_set_base => disable X2APIC mode kvm_apic_state_fixup kvm_lapic_xapic_id_updated kvm_xapic_id(apic) != apic->vcpu->vcpu_id kvm_set_apicv_inhibit(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)) => update APIC_ID When kvm_apic_set_state invokes kvm_lapic_set_base to disable x2APIC mode, the old 32-bit x2APIC id is still present rather than the 8-bit xAPIC id. kvm_lapic_xapic_id_updated will set the APICV_INHIBIT_REASON_APIC_ID_MODIFIED bit and disable APICv/x2AVIC. Instead, kvm_lapic_xapic_id_updated must be called after APIC_ID is changed. In fact, this fixes another small issue in the code in that potential changes to a vCPU's xAPIC ID need not be tracked for KVM_GET_LAPIC. Fixes: 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base") Signed-off-by: Yuan ZhaoXiong Message-Id: <1669984574-32692-1-git-send-email-yuanzhaoxiong@baidu.com> Cc: stable@vger.kernel.org Reported-by: Alejandro Jimenez Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1bb63746e9910..8224ac8b617a4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2724,8 +2724,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); } - } else { - kvm_lapic_xapic_id_updated(vcpu->arch.apic); } return 0; @@ -2761,6 +2759,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); + if (!apic_x2apic_mode(apic)) + kvm_lapic_xapic_id_updated(apic); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); -- GitLab From ef16b2dff4d1c71eb32b306d400d4c0f3a383ba7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:44 +0000 Subject: [PATCH 1104/1423] KVM: arm64: selftests: Enable single-step without a "full" ucall() Add a new ucall hook, GUEST_UCALL_NONE(), to allow tests to make ucalls without allocating a ucall struct, and use it to enable single-step in ARM's debug-exceptions test. Like the disable single-step path, the enabling path also needs to ensure that no exclusive access sequences are attempted after enabling single-step, as the exclusive monitor is cleared on ERET from the debug exception taken to EL2. The test currently "works" because clear_bit() isn't actually an atomic operation... yet. Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/aarch64/debug-exceptions.c | 21 ++++++++++--------- .../selftests/kvm/include/ucall_common.h | 8 +++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index d86c4e4d1c826..c62ec4d7f6a38 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -239,10 +239,6 @@ static void guest_svc_handler(struct ex_regs *regs) svc_addr = regs->pc; } -enum single_step_op { - SINGLE_STEP_ENABLE = 0, -}; - static void guest_code_ss(int test_cnt) { uint64_t i; @@ -253,8 +249,16 @@ static void guest_code_ss(int test_cnt) w_bvr = i << 2; w_wvr = i << 2; - /* Enable Single Step execution */ - GUEST_SYNC(SINGLE_STEP_ENABLE); + /* + * Enable Single Step execution. Note! This _must_ be a bare + * ucall as the ucall() path uses atomic operations to manage + * the ucall structures, and the built-in "atomics" are usually + * implemented via exclusive access instructions. The exlusive + * monitor is cleared on ERET, and so taking debug exceptions + * during a LDREX=>STREX sequence will prevent forward progress + * and hang the guest/test. + */ + GUEST_UCALL_NONE(); /* * The userspace will verify that the pc is as expected during @@ -356,12 +360,9 @@ void test_single_step_from_userspace(int test_cnt) break; } - TEST_ASSERT(cmd == UCALL_SYNC, + TEST_ASSERT(cmd == UCALL_NONE, "Unexpected ucall cmd 0x%lx", cmd); - TEST_ASSERT(uc.args[1] == SINGLE_STEP_ENABLE, - "Unexpected ucall action 0x%lx", uc.args[1]); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; ss_enable = true; diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index bdd373189a777..1a6aaef5ccae1 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -35,6 +35,14 @@ void ucall(uint64_t cmd, int nargs, ...); uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); +/* + * Perform userspace call without any associated data. This bare call avoids + * allocating a ucall struct, which can be useful if the atomic operations in + * the full ucall() are problematic and/or unwanted. Note, this will come out + * as UCALL_NONE on the backend. + */ +#define GUEST_UCALL_NONE() ucall_arch_do_ucall((vm_vaddr_t)NULL) + #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) -- GitLab From 7f2b47f22b825c16d9843e6e78bbb2370d2c31a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:45 +0000 Subject: [PATCH 1105/1423] tools: Take @bit as an "unsigned long" in {clear,set}_bit() helpers Take @bit as an unsigned long instead of a signed int in clear_bit() and set_bit() so that they match the double-underscore versions, __clear_bit() and __set_bit(). This will allow converting users that really don't want atomic operations to the double-underscores without introducing a functional change, which will in turn allow making {clear,set}_bit() atomic (as advertised). Practically speaking, this _should_ have no functional impact. KVM's selftests usage is either hardcoded (Hyper-V tests) or is artificially limited (arch_timer test and dirty_log test). In KVM, dirty_log test is the only mildly interesting case as it's use indirectly restricted to unsigned 32-bit values, but in theory it could generate a negative value when cast to a signed int. But in that case, taking an "unsigned long" is actually a bug fix. Perf's usage is more difficult to audit, but any code that is affected by the switch is likely already broken. perf_header__{set,clear}_feat() and perf_file_header__read() effectively use only hardcoded enums with small, positive values, atom_new() passes an unsigned long, but its value is capped at 128 via NR_ATOM_PER_PAGE, etc... The only real potential for breakage is in the perf flows that take a "cpu", but it's unlikely perf is subtly relying on a negative index into bitmaps, e.g. "cpu" can be "-1", but only as "not valid" placeholder. Note, tools/testing/nvdimm/ makes heavy use of set_bit(), but that code builds into a kernel module of sorts, i.e. pulls in all of the kernel's header and so is getting the kernel's atomic set_bit(). The NVDIMM test usage of atomics is likely unnecessary, e.g. ndtest_dimm_register() sets bits in a local variable, but that's neither here nor there as far as this change is concerned. Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/include/asm-generic/bitops/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h index 2f6ea28764a79..f64b049d236c7 100644 --- a/tools/include/asm-generic/bitops/atomic.h +++ b/tools/include/asm-generic/bitops/atomic.h @@ -5,12 +5,12 @@ #include #include -static inline void set_bit(int nr, unsigned long *addr) +static inline void set_bit(unsigned long nr, unsigned long *addr) { addr[nr / __BITS_PER_LONG] |= 1UL << (nr % __BITS_PER_LONG); } -static inline void clear_bit(int nr, unsigned long *addr) +static inline void clear_bit(unsigned long nr, unsigned long *addr) { addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); } -- GitLab From 75d7ba32f9829e778484cf6e96e6e8f80914b0b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:46 +0000 Subject: [PATCH 1106/1423] perf tools: Use dedicated non-atomic clear/set bit helpers Use the dedicated non-atomic helpers for {clear,set}_bit() and their test variants, i.e. the double-underscore versions. Depsite being defined in atomic.h, and despite the kernel versions being atomic in the kernel, tools' {clear,set}_bit() helpers aren't actually atomic. Move to the double-underscore versions so that the versions that are expected to be atomic (for kernel developers) can be made atomic without affecting users that don't want atomic operations. No functional change intended. Signed-off-by: Sean Christopherson Acked-by: Namhyung Kim Message-Id: <20221119013450.2643007-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/perf/bench/find-bit-bench.c | 2 +- tools/perf/builtin-c2c.c | 6 +++--- tools/perf/builtin-kwork.c | 6 +++--- tools/perf/builtin-record.c | 6 +++--- tools/perf/builtin-sched.c | 2 +- tools/perf/tests/bitmap.c | 2 +- tools/perf/tests/mem2node.c | 2 +- tools/perf/util/affinity.c | 4 ++-- tools/perf/util/header.c | 8 ++++---- tools/perf/util/mmap.c | 6 +++--- tools/perf/util/pmu.c | 2 +- tools/perf/util/scripting-engines/trace-event-perl.c | 2 +- tools/perf/util/scripting-engines/trace-event-python.c | 2 +- tools/perf/util/session.c | 2 +- tools/perf/util/svghelper.c | 2 +- 15 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c index 22b5cfe970237..d103c3136983d 100644 --- a/tools/perf/bench/find-bit-bench.c +++ b/tools/perf/bench/find-bit-bench.c @@ -70,7 +70,7 @@ static int do_for_each_set_bit(unsigned int num_bits) bitmap_zero(to_test, num_bits); skip = num_bits / set_bits; for (i = 0; i < num_bits; i += skip) - set_bit(i, to_test); + __set_bit(i, to_test); for (i = 0; i < outer_iterations; i++) { old = accumulator; diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index a9190458d2d50..52d94c7dd8366 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -230,7 +230,7 @@ static void c2c_he__set_cpu(struct c2c_hist_entry *c2c_he, "WARNING: no sample cpu value")) return; - set_bit(sample->cpu, c2c_he->cpuset); + __set_bit(sample->cpu, c2c_he->cpuset); } static void c2c_he__set_node(struct c2c_hist_entry *c2c_he, @@ -247,7 +247,7 @@ static void c2c_he__set_node(struct c2c_hist_entry *c2c_he, if (WARN_ONCE(node < 0, "WARNING: failed to find node\n")) return; - set_bit(node, c2c_he->nodeset); + __set_bit(node, c2c_he->nodeset); if (c2c_he->paddr != sample->phys_addr) { c2c_he->paddr_cnt++; @@ -2318,7 +2318,7 @@ static int setup_nodes(struct perf_session *session) continue; perf_cpu_map__for_each_cpu(cpu, idx, map) { - set_bit(cpu.cpu, set); + __set_bit(cpu.cpu, set); if (WARN_ONCE(cpu2node[cpu.cpu] != -1, "node/cpu topology bug")) return -EINVAL; diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index fb8c63656ad89..1f63e24f704e2 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -216,7 +216,7 @@ static struct kwork_atom *atom_new(struct perf_kwork *kwork, list_add_tail(&page->list, &kwork->atom_page_list); found_atom: - set_bit(i, page->bitmap); + __set_bit(i, page->bitmap); atom->time = sample->time; atom->prev = NULL; atom->page_addr = page; @@ -229,8 +229,8 @@ static void atom_free(struct kwork_atom *atom) if (atom->prev != NULL) atom_free(atom->prev); - clear_bit(atom->bit_inpage, - ((struct kwork_atom_page *)atom->page_addr)->bitmap); + __clear_bit(atom->bit_inpage, + ((struct kwork_atom_page *)atom->page_addr)->bitmap); } static void atom_del(struct kwork_atom *atom) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e128b855dddec..2711c141c5bf0 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3555,7 +3555,7 @@ static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cp /* Return ENODEV is input cpu is greater than max cpu */ if ((unsigned long)cpu.cpu > mask->nbits) return -ENODEV; - set_bit(cpu.cpu, mask->bits); + __set_bit(cpu.cpu, mask->bits); } return 0; @@ -3627,8 +3627,8 @@ static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map pr_debug("nr_threads: %d\n", rec->nr_threads); for (t = 0; t < rec->nr_threads; t++) { - set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits); - set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits); + __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits); + __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits); if (verbose) { pr_debug("thread_masks[%d]: ", t); mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps"); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f93737eef07ba..86e18575c9bee 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1573,7 +1573,7 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel, if (sched->map.comp) { cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); - if (!test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) { + if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) { sched->map.comp_cpus[cpus_nr++] = this_cpu; new_cpu = true; } diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 4965dd6669566..0173f5402a35b 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -18,7 +18,7 @@ static unsigned long *get_bitmap(const char *str, int nbits) if (map && bm) { for (i = 0; i < perf_cpu_map__nr(map); i++) - set_bit(perf_cpu_map__cpu(map, i).cpu, bm); + __set_bit(perf_cpu_map__cpu(map, i).cpu, bm); } if (map) diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index 4c96829510c91..a0e88c4961074 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -33,7 +33,7 @@ static unsigned long *get_bitmap(const char *str, int nbits) int i; perf_cpu_map__for_each_cpu(cpu, i, map) - set_bit(cpu.cpu, bm); + __set_bit(cpu.cpu, bm); } if (map) diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c index 4ee96b3c755b7..38dc4524b7e86 100644 --- a/tools/perf/util/affinity.c +++ b/tools/perf/util/affinity.c @@ -58,14 +58,14 @@ void affinity__set(struct affinity *a, int cpu) return; a->changed = true; - set_bit(cpu, a->sched_cpus); + __set_bit(cpu, a->sched_cpus); /* * We ignore errors because affinity is just an optimization. * This could happen for example with isolated CPUs or cpusets. * In this case the IPIs inside the kernel's perf API still work. */ sched_setaffinity(0, cpu_set_size, (cpu_set_t *)a->sched_cpus); - clear_bit(cpu, a->sched_cpus); + __clear_bit(cpu, a->sched_cpus); } static void __affinity__cleanup(struct affinity *a) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 98dfaf84bd137..dc2ae397d400e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -79,12 +79,12 @@ struct perf_file_attr { void perf_header__set_feat(struct perf_header *header, int feat) { - set_bit(feat, header->adds_features); + __set_bit(feat, header->adds_features); } void perf_header__clear_feat(struct perf_header *header, int feat) { - clear_bit(feat, header->adds_features); + __clear_bit(feat, header->adds_features); } bool perf_header__has_feat(const struct perf_header *header, int feat) @@ -1358,7 +1358,7 @@ static int memory_node__read(struct memory_node *n, unsigned long idx) rewinddir(dir); for_each_memory(phys, dir) { - set_bit(phys, n->set); + __set_bit(phys, n->set); } closedir(dir); @@ -3952,7 +3952,7 @@ int perf_file_header__read(struct perf_file_header *header, if (!test_bit(HEADER_HOSTNAME, header->adds_features)) { bitmap_zero(header->adds_features, HEADER_FEAT_BITS); - set_bit(HEADER_BUILD_ID, header->adds_features); + __set_bit(HEADER_BUILD_ID, header->adds_features); } } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index a4dff881be39b..49093b21ee2da 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -111,7 +111,7 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, struct perf_cpu cpu, i pr_err("Failed to allocate node mask for mbind: error %m\n"); return -1; } - set_bit(node_index, node_mask); + __set_bit(node_index, node_mask); if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) { pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n", data, data + mmap_len, node_index); @@ -256,7 +256,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask) for (idx = 0; idx < nr_cpus; idx++) { cpu = perf_cpu_map__cpu(cpu_map, idx); /* map c index to online cpu index */ if (cpu__get_node(cpu) == node) - set_bit(cpu.cpu, mask->bits); + __set_bit(cpu.cpu, mask->bits); } } @@ -270,7 +270,7 @@ static int perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params * if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1) build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask); else if (mp->affinity == PERF_AFFINITY_CPU) - set_bit(map->core.cpu.cpu, map->affinity_mask.bits); + __set_bit(map->core.cpu.cpu, map->affinity_mask.bits); return 0; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 03284059175f7..371d8f7a3de3c 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1513,7 +1513,7 @@ void perf_pmu__set_format(unsigned long *bits, long from, long to) memset(bits, 0, BITS_TO_BYTES(PERF_PMU_FORMAT_BITS)); for (b = from; b <= to; b++) - set_bit(b, bits); + __set_bit(b, bits); } void perf_pmu__del_formats(struct list_head *formats) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index a5d945415bbc1..5b602b6d46854 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -365,7 +365,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, sprintf(handler, "%s::%s", event->system, event->name); - if (!test_and_set_bit(event->id, events_defined)) + if (!__test_and_set_bit(event->id, events_defined)) define_event_symbols(event, handler, event->print_fmt.args); s = nsecs / NSEC_PER_SEC; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 1f2040f36d4e9..0f229fa29163e 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -933,7 +933,7 @@ static void python_process_tracepoint(struct perf_sample *sample, sprintf(handler_name, "%s__%s", event->system, event->name); - if (!test_and_set_bit(event->id, events_defined)) + if (!__test_and_set_bit(event->id, events_defined)) define_event_symbols(event, handler_name, event->print_fmt.args); handler = get_handler(handler_name); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 1a4f10de29ffe..873fd51ec1b21 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2748,7 +2748,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, goto out_delete_map; } - set_bit(cpu.cpu, cpu_bitmap); + __set_bit(cpu.cpu, cpu_bitmap); } err = 0; diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 1e0c731fc5396..5c62d3118c41f 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -741,7 +741,7 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus) break; } - set_bit(c.cpu, cpumask_bits(b)); + __set_bit(c.cpu, cpumask_bits(b)); } perf_cpu_map__put(m); -- GitLab From 03a0c819e71755398d59993b9adee203544617d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:47 +0000 Subject: [PATCH 1107/1423] KVM: selftests: Use non-atomic clear/set bit helpers in KVM tests Use the dedicated non-atomic helpers for {clear,set}_bit() and their test variants, i.e. the double-underscore versions. Depsite being defined in atomic.h, and despite the kernel versions being atomic in the kernel, tools' {clear,set}_bit() helpers aren't actually atomic. Move to the double-underscore versions so that the versions that are expected to be atomic (for kernel developers) can be made atomic without affecting users that don't want atomic operations. Leave the usage in ucall_free() as-is, it's the one place in tools/ that actually wants/needs atomic behavior. Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/aarch64/arch_timer.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 34 +++++++++---------- .../selftests/kvm/x86_64/hyperv_evmcs.c | 4 +-- .../selftests/kvm/x86_64/hyperv_svm_test.c | 4 +-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index f2a96779716a8..26556a266021e 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -222,7 +222,7 @@ static void *test_vcpu_run(void *arg) /* Currently, any exit from guest is an indication of completion */ pthread_mutex_lock(&vcpu_done_map_lock); - set_bit(vcpu_idx, vcpu_done_map); + __set_bit(vcpu_idx, vcpu_done_map); pthread_mutex_unlock(&vcpu_done_map_lock); switch (get_ucall(vcpu, &uc)) { diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index a38c4369fb8ed..a75548865f6b4 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -44,20 +44,20 @@ # define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) # define test_bit_le(nr, addr) \ test_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define set_bit_le(nr, addr) \ - set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define clear_bit_le(nr, addr) \ - clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define test_and_set_bit_le(nr, addr) \ - test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define test_and_clear_bit_le(nr, addr) \ - test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __set_bit_le(nr, addr) \ + __set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) #else -# define test_bit_le test_bit -# define set_bit_le set_bit -# define clear_bit_le clear_bit -# define test_and_set_bit_le test_and_set_bit -# define test_and_clear_bit_le test_and_clear_bit +# define test_bit_le test_bit +# define __set_bit_le __set_bit +# define __clear_bit_le __clear_bit +# define __test_and_set_bit_le __test_and_set_bit +# define __test_and_clear_bit_le __test_and_clear_bit #endif #define TEST_DIRTY_RING_COUNT 65536 @@ -305,7 +305,7 @@ static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, TEST_ASSERT(cur->offset < num_pages, "Offset overflow: " "0x%llx >= 0x%x", cur->offset, num_pages); //pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset); - set_bit_le(cur->offset, bitmap); + __set_bit_le(cur->offset, bitmap); dirty_ring_last_page = cur->offset; dirty_gfn_set_collected(cur); (*fetch_index)++; @@ -560,7 +560,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) value_ptr = host_test_mem + page * host_page_size; /* If this is a special page that we were tracking... */ - if (test_and_clear_bit_le(page, host_bmap_track)) { + if (__test_and_clear_bit_le(page, host_bmap_track)) { host_track_next_count++; TEST_ASSERT(test_bit_le(page, bmap), "Page %"PRIu64" should have its dirty bit " @@ -568,7 +568,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) page); } - if (test_and_clear_bit_le(page, bmap)) { + if (__test_and_clear_bit_le(page, bmap)) { bool matched; host_dirty_count++; @@ -661,7 +661,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) * should report its dirtyness in the * next run */ - set_bit_le(page, host_bmap_track); + __set_bit_le(page, host_bmap_track); } } } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index ba09d300c9539..af29e5776d403 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -142,7 +142,7 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, /* Intercept RDMSR 0xc0000100 */ vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | CPU_BASED_USE_MSR_BITMAPS); - set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); + __set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); GUEST_ASSERT(!vmresume()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); current_evmcs->guest_rip += 2; /* rdmsr */ @@ -154,7 +154,7 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, current_evmcs->guest_rip += 2; /* rdmsr */ /* Intercept RDMSR 0xc0000101 without telling KVM about it */ - set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); + __set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; GUEST_ASSERT(!vmresume()); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 3b3cc94ba8e47..68a7d354ea070 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -103,7 +103,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, /* Intercept RDMSR 0xc0000100 */ vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; - set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); + __set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); vmcb->save.rip += 2; /* rdmsr */ @@ -115,7 +115,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, vmcb->save.rip += 2; /* rdmsr */ /* Intercept RDMSR 0xc0000101 without telling KVM about it */ - set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); + __set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS; run_guest(vmcb, svm->vmcb_gpa); -- GitLab From 7f32a6cf8b5a8067537f25a1f12744292431aae1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:48 +0000 Subject: [PATCH 1108/1423] tools: Drop conflicting non-atomic test_and_{clear,set}_bit() helpers Drop tools' non-atomic test_and_set_bit() and test_and_clear_bit() helpers now that all users are gone. The names will be claimed in the future for atomic versions. Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/include/linux/bitmap.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 65d0747c5205c..f3566ea0f932e 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -77,40 +77,6 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - */ -static inline int test_and_set_bit(int nr, unsigned long *addr) -{ - unsigned long mask = BIT_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); - unsigned long old; - - old = *p; - *p = old | mask; - - return (old & mask) != 0; -} - -/** - * test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - */ -static inline int test_and_clear_bit(int nr, unsigned long *addr) -{ - unsigned long mask = BIT_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); - unsigned long old; - - old = *p; - *p = old & ~mask; - - return (old & mask) != 0; -} - /** * bitmap_zalloc - Allocate bitmap * @nbits: Number of bits -- GitLab From 36293352ff433061d45d52784983e44950c09ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:49 +0000 Subject: [PATCH 1109/1423] tools: Drop "atomic_" prefix from atomic test_and_set_bit() Drop the "atomic_" prefix from tools' atomic_test_and_set_bit() to match the kernel nomenclature where test_and_set_bit() is atomic, and __test_and_set_bit() provides the non-atomic variant. Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/arch/x86/include/asm/atomic.h | 3 +-- tools/include/asm-generic/atomic-gcc.h | 2 +- tools/testing/selftests/kvm/lib/ucall_common.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index 01cc27ec4520d..a42733af7d51a 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -71,10 +71,9 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } -static inline int atomic_test_and_set_bit(long nr, unsigned long *addr) +static inline int test_and_set_bit(long nr, unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, "Ir", nr, "%0", "c"); - } #endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */ diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 6daa68bf5b9e0..37ef522aaac48 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -70,7 +70,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval) return cmpxchg(&(v)->counter, oldval, newval); } -static inline int atomic_test_and_set_bit(long nr, unsigned long *addr) +static inline int test_and_set_bit(long nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); long old; diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index fcae96461e460..820ce6c828299 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -44,7 +44,7 @@ static struct ucall *ucall_alloc(void) GUEST_ASSERT(ucall_pool); for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (!atomic_test_and_set_bit(i, ucall_pool->in_use)) { + if (!test_and_set_bit(i, ucall_pool->in_use)) { uc = &ucall_pool->ucalls[i]; memset(uc->args, 0, sizeof(uc->args)); return uc; -- GitLab From bb056c0f080a3d15c2a9ad9057a8b542d45e4ba0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 19 Nov 2022 01:34:50 +0000 Subject: [PATCH 1110/1423] tools: KVM: selftests: Convert clear/set_bit() to actual atomics Convert {clear,set}_bit() to atomics as KVM's ucall implementation relies on clear_bit() being atomic, they are defined in atomic.h, and the same helpers in the kernel proper are atomic. KVM's ucall infrastructure is the only user of clear_bit() in tools/, and there are no true set_bit() users. tools/testing/nvdimm/ does make heavy use of set_bit(), but that code builds into a kernel module of sorts, i.e. pulls in all of the kernel's header and so is already getting the kernel's atomic set_bit(). Signed-off-by: Sean Christopherson Message-Id: <20221119013450.2643007-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/arch/x86/include/asm/atomic.h | 5 +++++ tools/include/asm-generic/atomic-gcc.h | 11 +++++++++++ tools/include/asm-generic/bitops/atomic.h | 15 ++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index a42733af7d51a..365cf182df12b 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -76,4 +76,9 @@ static inline int test_and_set_bit(long nr, unsigned long *addr) GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, "Ir", nr, "%0", "c"); } +static inline int test_and_clear_bit(long nr, unsigned long *addr) +{ + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, "Ir", nr, "%0", "c"); +} + #endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */ diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 37ef522aaac48..9b3c528bab924 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -81,4 +81,15 @@ static inline int test_and_set_bit(long nr, unsigned long *addr) return !!(old & mask); } +static inline int test_and_clear_bit(long nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + long old; + + addr += BIT_WORD(nr); + + old = __sync_fetch_and_and(addr, ~mask); + return !!(old & mask); +} + #endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */ diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h index f64b049d236c7..ab37a221b41a4 100644 --- a/tools/include/asm-generic/bitops/atomic.h +++ b/tools/include/asm-generic/bitops/atomic.h @@ -5,14 +5,11 @@ #include #include -static inline void set_bit(unsigned long nr, unsigned long *addr) -{ - addr[nr / __BITS_PER_LONG] |= 1UL << (nr % __BITS_PER_LONG); -} - -static inline void clear_bit(unsigned long nr, unsigned long *addr) -{ - addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); -} +/* + * Just alias the test versions, all of the compiler built-in atomics "fetch", + * and optimizing compile-time constants on x86 isn't worth the complexity. + */ +#define set_bit test_and_set_bit +#define clear_bit test_and_clear_bit #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */ -- GitLab From 4bf46e35826d8dc4fc0a103dd0ccd94c072a4c6a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 1 Dec 2022 09:13:54 +0000 Subject: [PATCH 1111/1423] KVM: selftests: Fix spelling mistake "probabalistic" -> "probabilistic" There is a spelling mistake in some help text. Fix it. Signed-off-by: Colin Ian King Message-Id: <20221201091354.1613652-1-colin.i.king@gmail.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index c33e89012ae6f..e9d6d1aecf89c 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -398,7 +398,7 @@ static void help(char *name) printf(" -x: Split the memory region into this number of memslots.\n" " (default: 1)\n"); printf(" -w: specify the percentage of pages which should be written to\n" - " as an integer from 0-100 inclusive. This is probabalistic,\n" + " as an integer from 0-100 inclusive. This is probabilistic,\n" " so -w X means each page has an X%% chance of writing\n" " and a (100-X)%% chance of reading.\n" " (default: 100 i.e. all pages are written to.)\n"); -- GitLab From 6925ba3d9b8ccf1989b4cf13d6f0d7e341899481 Mon Sep 17 00:00:00 2001 From: Hal Feng Date: Fri, 18 Nov 2022 09:17:14 +0800 Subject: [PATCH 1112/1423] RISC-V: defconfig: Enable CONFIG_SERIAL_8250_DW Add CONFIG_SERIAL_8250_DW=y, which is a necessary option for StarFive JH7110 and JH7100 SoCs to boot with serial ports. Reviewed-by: Conor Dooley Signed-off-by: Hal Feng Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20221118011714.70877-9-hal.feng@starfivetech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index daba5d7438620..74ed7037314fd 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -123,6 +123,7 @@ CONFIG_MICROSEMI_PHY=y CONFIG_INPUT_MOUSEDEV=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=y -- GitLab From 0c2a04128f500ea4dfc6bc449507005b998b76ab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Dec 2022 13:34:45 -0500 Subject: [PATCH 1113/1423] KVM: x86: remove unnecessary exports Several symbols are not used by vendor modules but still exported. Removing them ensures that new coupling between kvm.ko and kvm-*.ko is noticed and reviewed. Co-developed-by: Sean Christopherson Co-developed-by: Like Xu Signed-off-by: Sean Christopherson Signed-off-by: Like Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 - arch/x86/kvm/irq.c | 2 -- arch/x86/kvm/lapic.c | 3 --- arch/x86/kvm/x86.c | 8 -------- 4 files changed, 14 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 2c7f2a26421e8..cc3e8c7d08500 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -898,7 +898,6 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index d8d50558f1657..a70952eca9058 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -31,7 +31,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return r; } -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is a pending userspace external interrupt @@ -150,7 +149,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_inject_timer_irqs(vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8224ac8b617a4..4efdb4a4d72c6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -160,7 +160,6 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) && !(kvm_mwait_in_guest(vcpu->kvm) || kvm_can_post_timer_interrupt(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer); static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { @@ -1914,7 +1913,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) return vcpu->arch.apic->lapic_timer.hv_timer_in_use; } -EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { @@ -2432,7 +2430,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } -EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 152ea4993b76f..4825773886f9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -463,7 +463,6 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) { @@ -491,7 +490,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_recalculate_apic_map(vcpu->kvm); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. @@ -782,7 +780,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) @@ -811,7 +808,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) atomic_inc(&vcpu->arch.nmi_queued); kvm_make_request(KVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { @@ -836,7 +832,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { @@ -2069,7 +2064,6 @@ int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); int kvm_emulate_invd(struct kvm_vcpu *vcpu) { @@ -2515,7 +2509,6 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio) return _tsc; } -EXPORT_SYMBOL_GPL(kvm_scale_tsc); static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { @@ -12068,7 +12061,6 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { -- GitLab From 74bee0cad8dcd8ddec5e763c369239fc5990676a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 7 Oct 2022 15:16:44 -0700 Subject: [PATCH 1114/1423] KVM: x86: Advertise that the SMM_CTL MSR is not supported CPUID.80000021H:EAX[bit 9] indicates that the SMM_CTL MSR (0xc0010116) is not supported. This defeature can be advertised by KVM_GET_SUPPORTED_CPUID regardless of whether or not the host enumerates it; currently it will be included only if the host enumerates at least leaf 8000001DH, due to a preexisting bug in QEMU that KVM has to work around (commit f751d8eac176, "KVM: x86: work around QEMU issue with synthetic CPUID leaves", 2022-04-29). Signed-off-by: Jim Mattson Message-Id: <20221007221644.138355-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 723502181a3a9..0b5bf013fcb8e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1233,8 +1233,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * Other defined bits are for MSRs that KVM does not expose: * EAX 3 SPCL, SMM page configuration lock * EAX 13 PCMSR, Prefetch control MSR + * + * KVM doesn't support SMM_CTL. + * EAX 9 SMM_CTL MSR is not supported */ entry->eax &= BIT(0) | BIT(2) | BIT(6); + entry->eax |= BIT(9); if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) entry->eax |= BIT(2); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) -- GitLab From d33deda095d3637d218e7eed441633b2a01e1413 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Mon, 24 Oct 2022 09:47:24 +0000 Subject: [PATCH 1115/1423] riscv/mm: hugepage's PG_dcache_clean flag is only set in head page HugeTLB pages are always fully mapped, so only setting head page's PG_dcache_clean flag is enough. Signed-off-by: Tong Tiangen Link: https://lore.kernel.org/lkml/20220331065640.5777-2-songmuchun@bytedance.com/ Link: https://lore.kernel.org/r/20221024094725.3054311-2-tongtiangen@huawei.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cacheflush.h | 7 +++++++ arch/riscv/mm/cacheflush.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8a5c246b0a216..c172d05de4740 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -17,6 +17,13 @@ static inline void local_flush_icache_all(void) static inline void flush_dcache_page(struct page *page) { + /* + * HugeTLB pages are always fully mapped and only head page will be + * set PG_dcache_clean (see comments in flush_icache_pte()). + */ + if (PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 6cb7d96ad9c7b..062559c04fc3a 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -82,6 +82,13 @@ void flush_icache_pte(pte_t pte) { struct page *page = pte_page(pte); + /* + * HugeTLB pages are always fully mapped, so only setting head page's + * PG_dcache_clean flag is enough. + */ + if (PageHuge(page)) + page = compound_head(page); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) flush_icache_all(); } -- GitLab From d8bf77a1dc3079692f54be3087a5fd16d90027b0 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Mon, 24 Oct 2022 09:47:25 +0000 Subject: [PATCH 1116/1423] riscv/mm: add arch hook arch_clear_hugepage_flags With the PG_arch_1 we keep track if the page's data cache is clean, architecture rely on this property to treat new pages as dirty with respect to the data cache and perform the flushing before mapping the pages into userspace. This patch adds a new architecture hook, arch_clear_hugepage_flags,so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Fixes: 9e953cda5cdf ("riscv: Introduce huge page support for 32/64bit kernel") Signed-off-by: Tong Tiangen Link: https://lore.kernel.org/r/20221024094725.3054311-3-tongtiangen@huawei.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index a5c2ca1d1cd8b..ec19d6afc8965 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,4 +5,10 @@ #include #include +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_dcache_clean, &page->flags); +} +#define arch_clear_hugepage_flags arch_clear_hugepage_flags + #endif /* _ASM_RISCV_HUGETLB_H */ -- GitLab From b57c2f124098459a4acc15d5044f87cba31c87f0 Mon Sep 17 00:00:00 2001 From: Binglei Wang Date: Tue, 25 Oct 2022 16:18:32 +0100 Subject: [PATCH 1117/1423] riscv: add riscv rethook implementation Implement the kretprobes on riscv arch by using rethook machenism which abstracts general kretprobe info into a struct rethook_node to be embedded in the struct kretprobe_instance. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Binglei Wang Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20221025151831.1097417-1-conor@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/kprobes.h | 2 -- arch/riscv/kernel/probes/Makefile | 2 +- arch/riscv/kernel/probes/kprobes.c | 13 --------- arch/riscv/kernel/probes/rethook.c | 27 +++++++++++++++++++ arch/riscv/kernel/probes/rethook.h | 8 ++++++ ...obes_trampoline.S => rethook_trampoline.S} | 6 ++--- 7 files changed, 40 insertions(+), 19 deletions(-) create mode 100644 arch/riscv/kernel/probes/rethook.c create mode 100644 arch/riscv/kernel/probes/rethook.h rename arch/riscv/kernel/probes/{kprobes_trampoline.S => rethook_trampoline.S} (94%) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8d47562be90c7..ef8d66de5f381 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -101,6 +101,7 @@ config RISCV select HAVE_KPROBES if !XIP_KERNEL select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL select HAVE_KRETPROBES if !XIP_KERNEL + select HAVE_RETHOOK if !XIP_KERNEL select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_PCI diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index 217ef89f22b9f..e7882ccb0fd46 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -40,8 +40,6 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); bool kprobe_breakpoint_handler(struct pt_regs *regs); bool kprobe_single_step_handler(struct pt_regs *regs); -void __kretprobe_trampoline(void); -void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ #endif /* _ASM_RISCV_KPROBES_H */ diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile index 7f0840dcc31bc..c40139e9ca478 100644 --- a/arch/riscv/kernel/probes/Makefile +++ b/arch/riscv/kernel/probes/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o -obj-$(CONFIG_KPROBES) += kprobes_trampoline.o +obj-$(CONFIG_RETHOOK) += rethook.o rethook_trampoline.o obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o simulate-insn.o CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index e6e950b7cf327..f21592d203062 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -345,19 +345,6 @@ int __init arch_populate_kprobe_blacklist(void) return ret; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) -{ - return (void *)kretprobe_trampoline_handler(regs, NULL); -} - -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *)regs->ra; - ri->fp = NULL; - regs->ra = (unsigned long) &__kretprobe_trampoline; -} - int __kprobes arch_trampoline_kprobe(struct kprobe *p) { return 0; diff --git a/arch/riscv/kernel/probes/rethook.c b/arch/riscv/kernel/probes/rethook.c new file mode 100644 index 0000000000000..5c27c1f509894 --- /dev/null +++ b/arch/riscv/kernel/probes/rethook.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generic return hook for riscv. + */ + +#include +#include +#include "rethook.h" + +/* This is called from arch_rethook_trampoline() */ +unsigned long __used arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->s0); +} + +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +void arch_rethook_prepare(struct rethook_node *rhn, struct pt_regs *regs, bool mcount) +{ + rhn->ret_addr = regs->ra; + rhn->frame = regs->s0; + + /* replace return addr with trampoline */ + regs->ra = (unsigned long)arch_rethook_trampoline; +} + +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/riscv/kernel/probes/rethook.h b/arch/riscv/kernel/probes/rethook.h new file mode 100644 index 0000000000000..4758f7e3ce881 --- /dev/null +++ b/arch/riscv/kernel/probes/rethook.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __RISCV_RETHOOK_H +#define __RISCV_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); +void arch_rethook_prepare(struct rethook_node *rhn, struct pt_regs *regs, bool mcount); + +#endif diff --git a/arch/riscv/kernel/probes/kprobes_trampoline.S b/arch/riscv/kernel/probes/rethook_trampoline.S similarity index 94% rename from arch/riscv/kernel/probes/kprobes_trampoline.S rename to arch/riscv/kernel/probes/rethook_trampoline.S index 7bdb09ded39b8..21bac92a170a9 100644 --- a/arch/riscv/kernel/probes/kprobes_trampoline.S +++ b/arch/riscv/kernel/probes/rethook_trampoline.S @@ -75,13 +75,13 @@ REG_L x31, PT_T6(sp) .endm -ENTRY(__kretprobe_trampoline) +ENTRY(arch_rethook_trampoline) addi sp, sp, -(PT_SIZE_ON_STACK) save_all_base_regs move a0, sp /* pt_regs */ - call trampoline_probe_handler + call arch_rethook_trampoline_callback /* use the result as the return-address */ move ra, a0 @@ -90,4 +90,4 @@ ENTRY(__kretprobe_trampoline) addi sp, sp, PT_SIZE_ON_STACK ret -ENDPROC(__kretprobe_trampoline) +ENDPROC(arch_rethook_trampoline) -- GitLab From de92f65719cd672f4b48397540b9f9eff67eca40 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 12:59:11 -0800 Subject: [PATCH 1118/1423] exit: Allow oops_limit to be disabled In preparation for keeping oops_limit logic in sync with warn_limit, have oops_limit == 0 disable checking the Oops counter. Cc: Jann Horn Cc: Jonathan Corbet Cc: Andrew Morton Cc: Baolin Wang Cc: "Jason A. Donenfeld" Cc: Eric Biggers Cc: Huang Ying Cc: "Eric W. Biederman" Cc: Arnd Bergmann Cc: linux-doc@vger.kernel.org Signed-off-by: Kees Cook --- Documentation/admin-guide/sysctl/kernel.rst | 5 +++-- kernel/exit.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 09f3fb2f85858..a31d8d81ea078 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -671,8 +671,9 @@ oops_limit ========== Number of kernel oopses after which the kernel should panic when -``panic_on_oops`` is not set. Setting this to 0 or 1 has the same effect -as setting ``panic_on_oops=1``. +``panic_on_oops`` is not set. Setting this to 0 disables checking +the count. Setting this to 1 has the same effect as setting +``panic_on_oops=1``. The default value is 10000. osrelease, ostype & version diff --git a/kernel/exit.c b/kernel/exit.c index dc1a32149f944..deffb8e4b1b24 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -954,7 +954,7 @@ void __noreturn make_task_dead(int signr) * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ - if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit)) + if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit) && oops_limit) panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit); /* -- GitLab From 79cc1ba7badf9e7a12af99695a557e9ce27ee967 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:24 -0800 Subject: [PATCH 1119/1423] panic: Consolidate open-coded panic_on_warn checks Several run-time checkers (KASAN, UBSAN, KFENCE, KCSAN, sched) roll their own warnings, and each check "panic_on_warn". Consolidate this into a single function so that future instrumentation can be added in a single location. Cc: Marco Elver Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: Valentin Schneider Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vincenzo Frascino Cc: Andrew Morton Cc: David Gow Cc: tangmeng Cc: Jann Horn Cc: Shuah Khan Cc: Petr Mladek Cc: "Paul E. McKenney" Cc: Sebastian Andrzej Siewior Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: kasan-dev@googlegroups.com Cc: linux-mm@kvack.org Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20221117234328.594699-4-keescook@chromium.org --- include/linux/panic.h | 1 + kernel/kcsan/report.c | 3 +-- kernel/panic.c | 9 +++++++-- kernel/sched/core.c | 3 +-- lib/ubsan.c | 3 +-- mm/kasan/report.c | 4 ++-- mm/kfence/report.c | 3 +-- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/panic.h b/include/linux/panic.h index c7759b3f20452..979b776e3bcb3 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -11,6 +11,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); +void check_panic_on_warn(const char *origin); extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 67794404042a5..e95ce7d7a76e7 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change, dump_stack_print_info(KERN_DEFAULT); pr_err("==================================================================\n"); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KCSAN"); } static void release_report(unsigned long *flags, struct other_info *other_info) diff --git a/kernel/panic.c b/kernel/panic.c index d843d036651e0..cfa354322d5f5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -201,6 +201,12 @@ static void panic_print_sys_info(bool console_flush) ftrace_dump(DUMP_ALL); } +void check_panic_on_warn(const char *origin) +{ + if (panic_on_warn) + panic("%s: panic_on_warn set ...\n", origin); +} + /** * panic - halt the system * @fmt: The text string to print @@ -619,8 +625,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("kernel"); if (!regs) dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5800b0623ff30..285ef8821b4f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5729,8 +5729,7 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/lib/ubsan.c b/lib/ubsan.c index 36bd75e334263..60c7099857a05 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -154,8 +154,7 @@ static void ubsan_epilogue(void) current->in_ubsan--; - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("UBSAN"); } void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index df3602062bfd6..cc98dfdd3ed2f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -164,8 +164,8 @@ static void end_report(unsigned long *flags, void *addr) (unsigned long)addr); pr_err("==================================================================\n"); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - panic("panic_on_warn set ...\n"); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 7e496856c2ebe..110c27ca597da 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -268,8 +268,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r lockdep_on(); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KFENCE"); /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); -- GitLab From 9fc9e278a5c0b708eeffaf47d6eb0c82aa74ed78 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:25 -0800 Subject: [PATCH 1120/1423] panic: Introduce warn_limit Like oops_limit, add warn_limit for limiting the number of warnings when panic_on_warn is not set. Cc: Jonathan Corbet Cc: Andrew Morton Cc: Baolin Wang Cc: "Jason A. Donenfeld" Cc: Eric Biggers Cc: Huang Ying Cc: Petr Mladek Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-5-keescook@chromium.org --- Documentation/admin-guide/sysctl/kernel.rst | 10 ++++++++++ kernel/panic.c | 14 ++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a31d8d81ea078..179bd303b5853 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1509,6 +1509,16 @@ entry will default to 2 instead of 0. 2 Unprivileged calls to ``bpf()`` are disabled = ============================================================= + +warn_limit +========== + +Number of kernel warnings after which the kernel should panic when +``panic_on_warn`` is not set. Setting this to 0 disables checking +the warning count. Setting this to 1 has the same effect as setting +``panic_on_warn=1``. The default value is 0. + + watchdog ======== diff --git a/kernel/panic.c b/kernel/panic.c index cfa354322d5f5..f4403fc14f673 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -58,6 +58,7 @@ bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; +static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -88,6 +89,13 @@ static struct ctl_table kern_panic_table[] = { .extra2 = SYSCTL_ONE, }, #endif + { + .procname = "warn_limit", + .data = &warn_limit, + .maxlen = sizeof(warn_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -203,8 +211,14 @@ static void panic_print_sys_info(bool console_flush) void check_panic_on_warn(const char *origin) { + static atomic_t warn_count = ATOMIC_INIT(0); + if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); + + if (atomic_inc_return(&warn_count) >= READ_ONCE(warn_limit) && warn_limit) + panic("%s: system warned too often (kernel.warn_limit is %d)", + origin, warn_limit); } /** -- GitLab From 8b05aa26336113c4cea25f1c333ee8cd4fc212a6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:26 -0800 Subject: [PATCH 1121/1423] panic: Expose "warn_count" to sysfs Since Warn count is now tracked and is a fairly interesting signal, add the entry /sys/kernel/warn_count to expose it to userspace. Cc: Petr Mladek Cc: Andrew Morton Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Sebastian Andrzej Siewior Cc: Tiezhu Yang Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-6-keescook@chromium.org --- .../ABI/testing/sysfs-kernel-warn_count | 6 +++++ MAINTAINERS | 1 + kernel/panic.c | 22 +++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-warn_count diff --git a/Documentation/ABI/testing/sysfs-kernel-warn_count b/Documentation/ABI/testing/sysfs-kernel-warn_count new file mode 100644 index 0000000000000..08f083d2fd51b --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-warn_count @@ -0,0 +1,6 @@ +What: /sys/kernel/oops_count +Date: November 2022 +KernelVersion: 6.2.0 +Contact: Linux Kernel Hardening List +Description: + Shows how many times the system has Warned since last boot. diff --git a/MAINTAINERS b/MAINTAINERS index 0a1e95a58e54c..282cd8a513fd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11107,6 +11107,7 @@ L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: Documentation/ABI/testing/sysfs-kernel-oops_count +F: Documentation/ABI/testing/sysfs-kernel-warn_count F: include/linux/overflow.h F: include/linux/randomize_kstack.h F: mm/usercopy.c diff --git a/kernel/panic.c b/kernel/panic.c index f4403fc14f673..54deb743b2d5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,25 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +static atomic_t warn_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); +} + +static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); + +static __init int kernel_panic_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_panic_sysfs_init); +#endif + static long no_blink(int state) { return 0; @@ -211,8 +231,6 @@ static void panic_print_sys_info(bool console_flush) void check_panic_on_warn(const char *origin) { - static atomic_t warn_count = ATOMIC_INIT(0); - if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); -- GitLab From 5abf698754b8e5e4f1ca1058ee2b9785fbce6d23 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 28 Nov 2022 02:44:03 -0800 Subject: [PATCH 1122/1423] lib: fortify_kunit: build without structleak plugin Building allmodconfig with aarch64-linux-gnu-gcc (Debian 11.3.0-6), fortify_kunit with strucleak plugin enabled makes the stack frame size to grow too large: lib/fortify_kunit.c:140:1: error: the frame size of 2368 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] Turn off the structleak plugin checks for fortify_kunit. Suggested-by: Arnd Bergmann Signed-off-by: Anders Roxell Signed-off-by: Kees Cook --- lib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Makefile b/lib/Makefile index 2f0454b931dc8..83c650bb4459f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -379,6 +379,7 @@ obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) +CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o obj-$(CONFIG_STRSCPY_KUNIT_TEST) += strscpy_kunit.o obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o -- GitLab From 3a017d6355f24de42f2ad688df9fa19e0cb128f2 Mon Sep 17 00:00:00 2001 From: "haifeng.xu" Date: Mon, 28 Nov 2022 06:56:06 +0000 Subject: [PATCH 1123/1423] signal: Initialize the info in ksignal When handing the SIGNAL_GROUP_EXIT flag, the info in ksignal isn't cleared. However, the info acquired by dequeue_synchronous_signal/dequeue_signal is initialized and can be safely used. Fortunately, the fatal signal process just uses the si_signo and doesn't use any other member. Even so, the initialization before use is more safer. Signed-off-by: haifeng.xu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221128065606.19570-1-haifeng.xu@shopee.com --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/signal.c b/kernel/signal.c index d140672185a48..b9b0c8c620e7a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2693,6 +2693,7 @@ relock: /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { + clear_siginfo(&ksig->info); ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, -- GitLab From bdc77507fecd00ddad2f502f86a48a9ec38f0f84 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Dec 2022 16:23:25 -0800 Subject: [PATCH 1124/1423] um: virt-pci: Avoid GCC non-NULL warning GCC gets confused about the return value of get_cpu_var() possibly being NULL, so explicitly test for it before calls to memcpy() and memset(). Avoids warnings like this: arch/um/drivers/virt-pci.c: In function 'um_pci_send_cmd': include/linux/fortify-string.h:48:33: warning: argument 1 null where non-null expected [-Wnonnull] 48 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:438:9: note: in expansion of macro '__underlying_memcpy' 438 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:483:26: note: in expansion of macro '__fortify_memcpy_chk' 483 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ arch/um/drivers/virt-pci.c:100:9: note: in expansion of macro 'memcpy' 100 | memcpy(buf, cmd, cmd_size); | ^~~~~~ While at it, avoid literal "8" and use stored sizeof(buf->data) in memset() and um_pci_send_cmd(). Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202211271212.SUZSC9f9-lkp@intel.com Fixes: ba38961a069b ("um: Enable FORTIFY_SOURCE") Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: "Michael S. Tsirkin" Cc: Al Viro Cc: Xiu Jianfeng Cc: Vincent Whitchurch Cc: linux-um@lists.infradead.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- arch/um/drivers/virt-pci.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index acb55b302b14c..3ac220dafec4a 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -97,7 +97,8 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } buf = get_cpu_var(um_pci_msg_bufs); - memcpy(buf, cmd, cmd_size); + if (buf) + memcpy(buf, cmd, cmd_size); if (posted) { u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); @@ -182,6 +183,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, struct um_pci_message_buffer *buf; u8 *data; unsigned long ret = ULONG_MAX; + size_t bytes = sizeof(buf->data); if (!dev) return ULONG_MAX; @@ -189,7 +191,8 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, buf = get_cpu_var(um_pci_msg_bufs); data = buf->data; - memset(buf->data, 0xff, sizeof(buf->data)); + if (buf) + memset(data, 0xff, bytes); switch (size) { case 1: @@ -204,7 +207,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, goto out; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, 8)) + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) goto out; switch (size) { -- GitLab From d662198e03bc7fb4635156ee7e8b8d325e2d8512 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 19:42:55 -0800 Subject: [PATCH 1125/1423] hpet: Replace one-element array with flexible-array member One-element arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace one-element array with flexible-array member in struct hpet. This results in no differences in binary output. The use of struct hpet is never used with sizeof() and accesses via hpet_timers array are already done after explicit bounds checking. [1] https://github.com/KSPP/linux/issues/79 Cc: Clemens Ladisch Cc: "Gustavo A. R. Silva" Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20221118034250.never.999-kees@kernel.org --- include/linux/hpet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 8604564b985dc..21e69eaf7a36d 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -30,7 +30,7 @@ struct hpet { unsigned long _hpet_compare; } _u1; u64 hpet_fsb[2]; /* FSB route */ - } hpet_timers[1]; + } hpet_timers[]; }; #define hpet_mc _u0._hpet_mc -- GitLab From d272e01fa0a2f15c5c331a37cd99c6875c7b7186 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: [PATCH 1126/1423] ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/Y3OxronfaPYv9qGP@work --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d03..31f00cec5255b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3438,7 +3438,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3690,7 +3690,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f87505..aa5dbe54f5a18 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da3..e663ab9ea7590 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- GitLab From 649d6b1019a2f243bc3a98cb85902a8ebf74289a Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Wed, 26 Oct 2022 22:42:07 +0800 Subject: [PATCH 1127/1423] RISC-V: Add arch_crash_save_vmcoreinfo support Add arch_crash_save_vmcoreinfo(), which exports VM layout(MODULES, VMALLOC, VMEMMAP ranges and KERNEL_LINK_ADDR), va bits and ram base for vmcore. Default pagetable levels and PAGE_OFFSET aren't same for different kernel version as below. For pagetable levels, it sets sv57 by default and falls back to setting sv48 at boot time if sv57 is not supported by the hardware. For ram base, the default value is 0x80200000 for qemu riscv64 env and, for example, is 0x200000 on the XuanTie 910 CPU. * Linux Kernel 5.18 ~ * PGTABLE_LEVELS = 5 * PAGE_OFFSET = 0xff60000000000000 * Linux Kernel 5.17 ~ * PGTABLE_LEVELS = 4 * PAGE_OFFSET = 0xffffaf8000000000 * Linux Kernel 4.19 ~ * PGTABLE_LEVELS = 3 * PAGE_OFFSET = 0xffffffe000000000 Since these configurations change from time to time and version to version, it is preferable to export them via vmcoreinfo than to change the crash's code frequently, it can simplify the development of crash tool. Signed-off-by: Xianting Tian Tested-by: Deepak Gupta Tested-by: Guo Ren Acked-by: Baoquan He Link: https://lore.kernel.org/r/20221026144208.373504-2-xianting.tian@linux.alibaba.com [Palmer: wrap commit text] Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/crash_core.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 arch/riscv/kernel/crash_core.c diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index db6e4b1294ba3..4cf303a779ab9 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c new file mode 100644 index 0000000000000..b351a3c013555 --- /dev/null +++ b/arch/riscv/kernel/crash_core.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_NUMBER(VA_BITS); + VMCOREINFO_NUMBER(phys_ram_base); + + vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); + vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); + vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); + vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); +#ifdef CONFIG_64BIT + vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); + vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); +#endif + vmcoreinfo_append_str("NUMBER(KERNEL_LINK_ADDR)=0x%lx\n", KERNEL_LINK_ADDR); +} -- GitLab From c5b4216929ebc8ac9107a373db65babc14ba4e80 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Wed, 26 Oct 2022 22:42:08 +0800 Subject: [PATCH 1128/1423] Documentation: kdump: describe VMCOREINFO export for RISCV64 The following interrelated definitions and ranges are needed by the kdump crash tool, which are exported by "arch/riscv/kernel/crash_core.c": VA_BITS, PAGE_OFFSET, phys_ram_base, KERNEL_LINK_ADDR, MODULES_VADDR ~ MODULES_END, VMALLOC_START ~ VMALLOC_END, VMEMMAP_START ~ VMEMMAP_END, Document these RISCV64 exports above. Reviewed-by: Bagas Sanjaya Signed-off-by: Xianting Tian Acked-by: Baoquan He Link: https://lore.kernel.org/r/20221026144208.373504-3-xianting.tian@linux.alibaba.com [Palmer: wrap commit text] Signed-off-by: Palmer Dabbelt --- .../admin-guide/kdump/vmcoreinfo.rst | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 6726f439958ca..86fd884928700 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -595,3 +595,32 @@ X2TLB ----- Indicates whether the crashed kernel enabled SH extended mode. + +RISCV64 +======= + +VA_BITS +------- + +The maximum number of bits for virtual addresses. Used to compute the +virtual memory ranges. + +PAGE_OFFSET +----------- + +Indicates the virtual kernel start address of the direct-mapped RAM region. + +phys_ram_base +------------- + +Indicates the start physical RAM address. + +MODULES_VADDR|MODULES_END|VMALLOC_START|VMALLOC_END|VMEMMAP_START|VMEMMAP_END|KERNEL_LINK_ADDR +---------------------------------------------------------------------------------------------- + +Used to get the correct ranges: + + * MODULES_VADDR ~ MODULES_END : Kernel module space. + * VMALLOC_START ~ VMALLOC_END : vmalloc() / ioremap() space. + * VMEMMAP_START ~ VMEMMAP_END : vmemmap space, used for struct page array. + * KERNEL_LINK_ADDR : start address of Kernel link and BPF -- GitLab From 323a74fc20f53c0d0e13a16aee703a30d9751235 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 2 Dec 2022 13:19:40 -0800 Subject: [PATCH 1129/1423] RDMA: Disable IB HW for UML Disable all of drivers/infiniband/hw/ and rdmavt for UML builds until someone needs it and provides patches to support it. This prevents build errors in hw/qib/qib_wc_x86_64.c. Fixes: 68f5d3f3b654 ("um: add PCI over virtio emulation driver") Signed-off-by: Randy Dunlap Cc: Jason Gunthorpe Cc: Dennis Dalessandro Cc: Christoph Hellwig Cc: linux-rdma@vger.kernel.org Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: linux-um@lists.infradead.org Link: https://lore.kernel.org/r/20221202211940.29111-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index ccc874478f0b6..a5827d11e9346 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -78,6 +78,7 @@ config INFINIBAND_VIRT_DMA def_bool !HIGHMEM if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS +if !UML source "drivers/infiniband/hw/bnxt_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" @@ -95,6 +96,7 @@ source "drivers/infiniband/hw/qib/Kconfig" source "drivers/infiniband/hw/usnic/Kconfig" source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +endif # !UML source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/sw/siw/Kconfig" endif -- GitLab From 725349f8ba1e78a146c6ff8f3ee5e2712e517106 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 12:00:37 +0800 Subject: [PATCH 1130/1423] RDMA/hfi1: Fix error return code in parse_platform_config() In the previous iteration of the while loop, the "ret" may have been assigned a value of 0, so the error return code -EINVAL may have been incorrectly set to 0. To fix set valid return code before calling to goto. Fixes: 97167e813415 ("staging/rdma/hfi1: Tune for unknown channel if configuration file is absent") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1669953638-11747-1-git-send-email-wangyufen@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/firmware.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 1d77514ebbee0..0c0cef5b1e0e5 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -1743,6 +1743,7 @@ int parse_platform_config(struct hfi1_devdata *dd) if (!dd->platform_config.data) { dd_dev_err(dd, "%s: Missing config file\n", __func__); + ret = -EINVAL; goto bail; } ptr = (u32 *)dd->platform_config.data; @@ -1751,6 +1752,7 @@ int parse_platform_config(struct hfi1_devdata *dd) ptr++; if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { dd_dev_err(dd, "%s: Bad config file\n", __func__); + ret = -EINVAL; goto bail; } @@ -1774,6 +1776,7 @@ int parse_platform_config(struct hfi1_devdata *dd) if (file_length > dd->platform_config.size) { dd_dev_info(dd, "%s:File claims to be larger than read size\n", __func__); + ret = -EINVAL; goto bail; } else if (file_length < dd->platform_config.size) { dd_dev_info(dd, @@ -1794,6 +1797,7 @@ int parse_platform_config(struct hfi1_devdata *dd) dd_dev_err(dd, "%s: Failed validation at offset %ld\n", __func__, (ptr - (u32 *) dd->platform_config.data)); + ret = -EINVAL; goto bail; } @@ -1837,6 +1841,7 @@ int parse_platform_config(struct hfi1_devdata *dd) __func__, table_type, (ptr - (u32 *) dd->platform_config.data)); + ret = -EINVAL; goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table = ptr; @@ -1856,6 +1861,7 @@ int parse_platform_config(struct hfi1_devdata *dd) __func__, table_type, (ptr - (u32 *)dd->platform_config.data)); + ret = -EINVAL; goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table_metadata = -- GitLab From ed461b30b22c8fa85c25189c14cb89f29595cd14 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 12:00:38 +0800 Subject: [PATCH 1131/1423] RDMA/srp: Fix error return code in srp_parse_options() In the previous iteration of the while loop, the "ret" may have been assigned a value of 0, so the error return code -EINVAL may have been incorrectly set to 0. To fix set valid return code before calling to goto. Also investigate each case separately as Andy suggessted. Fixes: e711f968c49c ("IB/srp: replace custom implementation of hex2bin()") Fixes: 2a174df0c602 ("IB/srp: Use kstrtoull() instead of simple_strtoull()") Fixes: 19f313438c77 ("IB/srp: Add RDMA/CM support") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1669953638-11747-2-git-send-email-wangyufen@huawei.com Reviewed-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srp/ib_srp.c | 96 ++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 1075c2ac8fe20..b4d6a4a5ae81e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3410,7 +3410,8 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_PKEY: - if (match_hex(args, &token)) { + ret = match_hex(args, &token); + if (ret) { pr_warn("bad P_Key parameter '%s'\n", p); goto out; } @@ -3470,7 +3471,8 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_MAX_SECT: - if (match_int(args, &token)) { + ret = match_int(args, &token); + if (ret) { pr_warn("bad max sect parameter '%s'\n", p); goto out; } @@ -3478,8 +3480,15 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_QUEUE_SIZE: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for queue_size parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad queue_size parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->scsi_host->can_queue = token; @@ -3490,25 +3499,40 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max cmd_per_lun parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_TARGET_CAN_QUEUE: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max target_can_queue parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad max target_can_queue parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->target_can_queue = token; break; case SRP_OPT_IO_CLASS: - if (match_hex(args, &token)) { + ret = match_hex(args, &token); + if (ret) { pr_warn("bad IO class parameter '%s'\n", p); goto out; } @@ -3517,6 +3541,7 @@ static int srp_parse_options(struct net *net, const char *buf, pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", token, SRP_REV10_IB_IO_CLASS, SRP_REV16A_IB_IO_CLASS); + ret = -EINVAL; goto out; } target->io_class = token; @@ -3539,16 +3564,24 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_CMD_SG_ENTRIES: - if (match_int(args, &token) || token < 1 || token > 255) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max cmd_sg_entries parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1 || token > 255) { pr_warn("bad max cmd_sg_entries parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->cmd_sg_cnt = token; break; case SRP_OPT_ALLOW_EXT_SG: - if (match_int(args, &token)) { + ret = match_int(args, &token); + if (ret) { pr_warn("bad allow_ext_sg parameter '%s'\n", p); goto out; } @@ -3556,43 +3589,77 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_SG_TABLESIZE: - if (match_int(args, &token) || token < 1 || - token > SG_MAX_SEGMENTS) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max sg_tablesize parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1 || token > SG_MAX_SEGMENTS) { pr_warn("bad max sg_tablesize parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->sg_tablesize = token; break; case SRP_OPT_COMP_VECTOR: - if (match_int(args, &token) || token < 0) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for comp_vector parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 0) { pr_warn("bad comp_vector parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->comp_vector = token; break; case SRP_OPT_TL_RETRY_COUNT: - if (match_int(args, &token) || token < 2 || token > 7) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for tl_retry_count parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 2 || token > 7) { pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", p); + ret = -EINVAL; goto out; } target->tl_retry_count = token; break; case SRP_OPT_MAX_IT_IU_SIZE: - if (match_int(args, &token) || token < 0) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max it_iu_size parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 0) { pr_warn("bad maximum initiator to target IU size '%s'\n", p); + ret = -EINVAL; goto out; } target->max_it_iu_size = token; break; case SRP_OPT_CH_COUNT: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for channel count parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad channel count %s\n", p); + ret = -EINVAL; goto out; } target->ch_count = token; @@ -3601,6 +3668,7 @@ static int srp_parse_options(struct net *net, const char *buf, default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); + ret = -EINVAL; goto out; } } -- GitLab From 6978837ce42f8bea85041fc08c854f4e28852b3e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sat, 3 Dec 2022 11:37:14 +0800 Subject: [PATCH 1132/1423] RDMA/mlx5: no need to kfree NULL pointer Goto label 'free' where it will kfree the 'in' is not needed though it's safe to kfree NULL. Return err code directly to simplify the code. 1973 free: 1974 kfree(in); 1975 return err; Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20221203033714.25870-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 410cc5fd25239..053fe946e45ae 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1929,10 +1929,8 @@ int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); in = kzalloc(inlen, GFP_KERNEL); - if (!in) { - err = -ENOMEM; - goto free; - } + if (!in) + return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); -- GitLab From 1f5619ed881081be300db61da552ffae7163bb72 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Dec 2022 13:30:51 -0800 Subject: [PATCH 1133/1423] xfs: Remove duplicated include in xfs_iomap.c ./fs/xfs/xfs_iomap.c: xfs_error.h is included more than once. ./fs/xfs/xfs_iomap.c: xfs_errortag.h is included more than once. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3337 Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 68436370927d5..43f447199c082 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,8 +27,6 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" -#include "xfs_error.h" -#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) -- GitLab From af45de8368883c9620a7735a8c2532e52101c1a2 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:01 +0100 Subject: [PATCH 1134/1423] dt-bindings: qcom: geni-se: document I2C Master Hub wrapper variant The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Document the variant compatible, forbid UART and SPI sub-nodes, and remove requirement for the Master AHB clock and iommu property. Signed-off-by: Neil Armstrong Reviewed-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- .../bindings/soc/qcom/qcom,geni-se.yaml | 44 ++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml index 2bf5293fc9952..ab4df02052853 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml @@ -21,20 +21,19 @@ properties: compatible: enum: - qcom,geni-se-qup + - qcom,geni-se-i2c-master-hub reg: description: QUP wrapper common register address and length. maxItems: 1 clock-names: - items: - - const: m-ahb - - const: s-ahb + minItems: 1 + maxItems: 2 clocks: - items: - - description: Master AHB Clock - - description: Slave AHB Clock + minItems: 1 + maxItems: 2 "#address-cells": const: 2 @@ -81,6 +80,39 @@ patternProperties: description: GENI Serial Engine based UART Controller. $ref: /schemas/serial/qcom,serial-geni-qcom.yaml# +allOf: + - if: + properties: + compatible: + contains: + const: qcom,geni-se-i2c-master-hub + then: + properties: + clock-names: + items: + - const: s-ahb + + clocks: + items: + - description: Slave AHB Clock + + iommus: false + + patternProperties: + "spi@[0-9a-f]+$": false + "serial@[0-9a-f]+$": false + else: + properties: + clock-names: + items: + - const: m-ahb + - const: s-ahb + + clocks: + items: + - description: Master AHB Clock + - description: Slave AHB Clock + additionalProperties: false examples: -- GitLab From cb29d4e6a9ef004eed24774bc26839aa9e373ba4 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:02 +0100 Subject: [PATCH 1135/1423] dt-bindings: i2c: qcom-geni: document I2C Master Hub serial I2C engine The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Document the I2C Serial Engine variant used within the I2C Master Hub Wrapper. This serial engine variant lacks DMA support, requires a core clock, and since DMA support is lacking the memory interconnect path isn't needed. Signed-off-by: Neil Armstrong Reviewed-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- .../bindings/i2c/qcom,i2c-geni-qcom.yaml | 64 ++++++++++++++++--- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml index 0e7ed00562e21..f5f7dc8f325cb 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml @@ -10,18 +10,19 @@ maintainers: - Andy Gross - Bjorn Andersson -allOf: - - $ref: /schemas/i2c/i2c-controller.yaml# - properties: compatible: - const: qcom,geni-i2c + enum: + - qcom,geni-i2c + - qcom,geni-i2c-master-hub clocks: - maxItems: 1 + minItems: 1 + maxItems: 2 clock-names: - const: se + minItems: 1 + maxItems: 2 clock-frequency: default: 100000 @@ -35,13 +36,12 @@ properties: - const: rx interconnects: + minItems: 2 maxItems: 3 interconnect-names: - items: - - const: qup-core - - const: qup-config - - const: qup-memory + minItems: 2 + maxItems: 3 interrupts: maxItems: 1 @@ -71,6 +71,50 @@ required: - clock-names - reg +allOf: + - $ref: /schemas/i2c/i2c-controller.yaml# + - if: + properties: + compatible: + contains: + const: qcom,geni-i2c-master-hub + then: + properties: + clocks: + minItems: 2 + + clock-names: + items: + - const: se + - const: core + + dmas: false + dma-names: false + + interconnects: + maxItems: 2 + + interconnect-names: + items: + - const: qup-core + - const: qup-config + else: + properties: + clocks: + maxItems: 1 + + clock-names: + const: se + + interconnects: + minItems: 3 + + interconnect-names: + items: + - const: qup-core + - const: qup-config + - const: qup-memory + unevaluatedProperties: false examples: -- GitLab From 63fc9af83c11e02ea9c981d3bd0382e36f49916f Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:03 +0100 Subject: [PATCH 1136/1423] soc: qcom: geni-se: add desc struct to specify clocks from device match data The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Prepare support for the I2C Master Hub variant by moving the required clocks list to a new desc struct then passing it through the compatible match data. Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Signed-off-by: Wolfram Sang --- drivers/soc/qcom/qcom-geni-se.c | 69 ++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c index a0ceeede450f1..9ddee9fd11ba7 100644 --- a/drivers/soc/qcom/qcom-geni-se.c +++ b/drivers/soc/qcom/qcom-geni-se.c @@ -81,19 +81,31 @@ */ #define MAX_CLK_PERF_LEVEL 32 -#define NUM_AHB_CLKS 2 +#define MAX_CLKS 2 /** * struct geni_wrapper - Data structure to represent the QUP Wrapper Core * @dev: Device pointer of the QUP wrapper core * @base: Base address of this instance of QUP wrapper core - * @ahb_clks: Handle to the primary & secondary AHB clocks + * @clks: Handle to the primary & optional secondary AHB clocks + * @num_clks: Count of clocks * @to_core: Core ICC path */ struct geni_wrapper { struct device *dev; void __iomem *base; - struct clk_bulk_data ahb_clks[NUM_AHB_CLKS]; + struct clk_bulk_data clks[MAX_CLKS]; + unsigned int num_clks; +}; + +/** + * struct geni_se_desc - Data structure to represent the QUP Wrapper resources + * @clks: Name of the primary & optional secondary AHB clocks + * @num_clks: Count of clock names + */ +struct geni_se_desc { + unsigned int num_clks; + const char * const *clks; }; static const char * const icc_path_names[] = {"qup-core", "qup-config", @@ -496,8 +508,7 @@ static void geni_se_clks_off(struct geni_se *se) struct geni_wrapper *wrapper = se->wrapper; clk_disable_unprepare(se->clk); - clk_bulk_disable_unprepare(ARRAY_SIZE(wrapper->ahb_clks), - wrapper->ahb_clks); + clk_bulk_disable_unprepare(wrapper->num_clks, wrapper->clks); } /** @@ -528,15 +539,13 @@ static int geni_se_clks_on(struct geni_se *se) int ret; struct geni_wrapper *wrapper = se->wrapper; - ret = clk_bulk_prepare_enable(ARRAY_SIZE(wrapper->ahb_clks), - wrapper->ahb_clks); + ret = clk_bulk_prepare_enable(wrapper->num_clks, wrapper->clks); if (ret) return ret; ret = clk_prepare_enable(se->clk); if (ret) - clk_bulk_disable_unprepare(ARRAY_SIZE(wrapper->ahb_clks), - wrapper->ahb_clks); + clk_bulk_disable_unprepare(wrapper->num_clks, wrapper->clks); return ret; } @@ -887,11 +896,33 @@ static int geni_se_probe(struct platform_device *pdev) return PTR_ERR(wrapper->base); if (!has_acpi_companion(&pdev->dev)) { - wrapper->ahb_clks[0].id = "m-ahb"; - wrapper->ahb_clks[1].id = "s-ahb"; - ret = devm_clk_bulk_get(dev, NUM_AHB_CLKS, wrapper->ahb_clks); + const struct geni_se_desc *desc; + int i; + + desc = device_get_match_data(&pdev->dev); + if (!desc) + return -EINVAL; + + wrapper->num_clks = min_t(unsigned int, desc->num_clks, MAX_CLKS); + + for (i = 0; i < wrapper->num_clks; ++i) + wrapper->clks[i].id = desc->clks[i]; + + ret = of_count_phandle_with_args(dev->of_node, "clocks", "#clock-cells"); + if (ret < 0) { + dev_err(dev, "invalid clocks property at %pOF\n", dev->of_node); + return ret; + } + + if (ret < wrapper->num_clks) { + dev_err(dev, "invalid clocks count at %pOF, expected %d entries\n", + dev->of_node, wrapper->num_clks); + return -EINVAL; + } + + ret = devm_clk_bulk_get(dev, wrapper->num_clks, wrapper->clks); if (ret) { - dev_err(dev, "Err getting AHB clks %d\n", ret); + dev_err(dev, "Err getting clks %d\n", ret); return ret; } } @@ -901,8 +932,18 @@ static int geni_se_probe(struct platform_device *pdev) return devm_of_platform_populate(dev); } +static const char * const qup_clks[] = { + "m-ahb", + "s-ahb", +}; + +static const struct geni_se_desc qup_desc = { + .clks = qup_clks, + .num_clks = ARRAY_SIZE(qup_clks), +}; + static const struct of_device_id geni_se_dt_match[] = { - { .compatible = "qcom,geni-se-qup", }, + { .compatible = "qcom,geni-se-qup", .data = &qup_desc }, {} }; MODULE_DEVICE_TABLE(of, geni_se_dt_match); -- GitLab From f4aba01db4801cc02d1fae4c8099984a23740996 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:04 +0100 Subject: [PATCH 1137/1423] soc: qcom: geni-se: add support for I2C Master Hub wrapper variant The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Add the clock list for the I2C Master Hub variant to a new desc struct then pass it through the I2C Master Hub compatible match data. Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Signed-off-by: Wolfram Sang --- drivers/soc/qcom/qcom-geni-se.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c index 9ddee9fd11ba7..f0475b93ca730 100644 --- a/drivers/soc/qcom/qcom-geni-se.c +++ b/drivers/soc/qcom/qcom-geni-se.c @@ -942,8 +942,18 @@ static const struct geni_se_desc qup_desc = { .num_clks = ARRAY_SIZE(qup_clks), }; +static const char * const i2c_master_hub_clks[] = { + "s-ahb", +}; + +static const struct geni_se_desc i2c_master_hub_desc = { + .clks = i2c_master_hub_clks, + .num_clks = ARRAY_SIZE(i2c_master_hub_clks), +}; + static const struct of_device_id geni_se_dt_match[] = { { .compatible = "qcom,geni-se-qup", .data = &qup_desc }, + { .compatible = "qcom,geni-se-i2c-master-hub", .data = &i2c_master_hub_desc }, {} }; MODULE_DEVICE_TABLE(of, geni_se_dt_match); -- GitLab From 14d02fbadb5dc1cdf66078ef8430dd1cd22bfd53 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:05 +0100 Subject: [PATCH 1138/1423] i2c: qcom-geni: add desc struct to prepare support for I2C Master Hub variant The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Those I2C serial engines variants have some requirements: - a separate "core" clock - doesn't support DMA, thus no memory interconnect path - fixed FIFO size not discoverable in the HW_PARAM_0 register Add a desc struct specifying all those requirements which will be used in a next change when adding the I2C Master Hub serial engine compatible. Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 50 ++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 84a77512614d9..75dd0718c5a12 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -88,6 +88,7 @@ struct geni_i2c_dev { int cur_wr; int cur_rd; spinlock_t lock; + struct clk *core_clk; u32 clk_freq_out; const struct geni_i2c_clk_fld *clk_fld; int suspended; @@ -100,6 +101,13 @@ struct geni_i2c_dev { bool abort_done; }; +struct geni_i2c_desc { + bool has_core_clk; + char *icc_ddr; + bool no_dma_support; + unsigned int tx_fifo_depth; +}; + struct geni_i2c_err_log { int err; const char *msg; @@ -764,6 +772,7 @@ static int geni_i2c_probe(struct platform_device *pdev) u32 proto, tx_depth, fifo_disable; int ret; struct device *dev = &pdev->dev; + const struct geni_i2c_desc *desc = NULL; gi2c = devm_kzalloc(dev, sizeof(*gi2c), GFP_KERNEL); if (!gi2c) @@ -776,6 +785,14 @@ static int geni_i2c_probe(struct platform_device *pdev) if (IS_ERR(gi2c->se.base)) return PTR_ERR(gi2c->se.base); + desc = device_get_match_data(&pdev->dev); + + if (desc && desc->has_core_clk) { + gi2c->core_clk = devm_clk_get(dev, "core"); + if (IS_ERR(gi2c->core_clk)) + return PTR_ERR(gi2c->core_clk); + } + gi2c->se.clk = devm_clk_get(dev, "se"); if (IS_ERR(gi2c->se.clk) && !has_acpi_companion(dev)) return PTR_ERR(gi2c->se.clk); @@ -819,7 +836,7 @@ static int geni_i2c_probe(struct platform_device *pdev) gi2c->adap.dev.of_node = dev->of_node; strscpy(gi2c->adap.name, "Geni-I2C", sizeof(gi2c->adap.name)); - ret = geni_icc_get(&gi2c->se, "qup-memory"); + ret = geni_icc_get(&gi2c->se, desc ? desc->icc_ddr : "qup-memory"); if (ret) return ret; /* @@ -829,12 +846,17 @@ static int geni_i2c_probe(struct platform_device *pdev) */ gi2c->se.icc_paths[GENI_TO_CORE].avg_bw = GENI_DEFAULT_BW; gi2c->se.icc_paths[CPU_TO_GENI].avg_bw = GENI_DEFAULT_BW; - gi2c->se.icc_paths[GENI_TO_DDR].avg_bw = Bps_to_icc(gi2c->clk_freq_out); + if (!desc || desc->icc_ddr) + gi2c->se.icc_paths[GENI_TO_DDR].avg_bw = Bps_to_icc(gi2c->clk_freq_out); ret = geni_icc_set_bw(&gi2c->se); if (ret) return ret; + ret = clk_prepare_enable(gi2c->core_clk); + if (ret) + return ret; + ret = geni_se_resources_on(&gi2c->se); if (ret) { dev_err(dev, "Error turning on resources %d\n", ret); @@ -844,10 +866,15 @@ static int geni_i2c_probe(struct platform_device *pdev) if (proto != GENI_SE_I2C) { dev_err(dev, "Invalid proto %d\n", proto); geni_se_resources_off(&gi2c->se); + clk_disable_unprepare(gi2c->core_clk); return -ENXIO; } - fifo_disable = readl_relaxed(gi2c->se.base + GENI_IF_DISABLE_RO) & FIFO_IF_DISABLE; + if (desc && desc->no_dma_support) + fifo_disable = false; + else + fifo_disable = readl_relaxed(gi2c->se.base + GENI_IF_DISABLE_RO) & FIFO_IF_DISABLE; + if (fifo_disable) { /* FIFO is disabled, so we can only use GPI DMA */ gi2c->gpi_mode = true; @@ -859,6 +886,16 @@ static int geni_i2c_probe(struct platform_device *pdev) } else { gi2c->gpi_mode = false; tx_depth = geni_se_get_tx_fifo_depth(&gi2c->se); + + /* I2C Master Hub Serial Elements doesn't have the HW_PARAM_0 register */ + if (!tx_depth && desc) + tx_depth = desc->tx_fifo_depth; + + if (!tx_depth) { + dev_err(dev, "Invalid TX FIFO depth\n"); + return -EINVAL; + } + gi2c->tx_wm = tx_depth - 1; geni_se_init(&gi2c->se, gi2c->tx_wm, tx_depth); geni_se_config_packing(&gi2c->se, BITS_PER_BYTE, @@ -867,6 +904,7 @@ static int geni_i2c_probe(struct platform_device *pdev) dev_dbg(dev, "i2c fifo/se-dma mode. fifo depth:%d\n", tx_depth); } + clk_disable_unprepare(gi2c->core_clk); ret = geni_se_resources_off(&gi2c->se); if (ret) { dev_err(dev, "Error turning off resources %d\n", ret); @@ -932,6 +970,8 @@ static int __maybe_unused geni_i2c_runtime_suspend(struct device *dev) gi2c->suspended = 1; } + clk_disable_unprepare(gi2c->core_clk); + return geni_icc_disable(&gi2c->se); } @@ -944,6 +984,10 @@ static int __maybe_unused geni_i2c_runtime_resume(struct device *dev) if (ret) return ret; + ret = clk_prepare_enable(gi2c->core_clk); + if (ret) + return ret; + ret = geni_se_resources_on(&gi2c->se); if (ret) return ret; -- GitLab From cacd9643eca7a1f4635479aff4ec33aaade45e64 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 29 Nov 2022 15:47:06 +0100 Subject: [PATCH 1139/1423] i2c: qcom-geni: add support for I2C Master Hub variant The I2C Master Hub is a stripped down version of the GENI Serial Engine QUP Wrapper Controller but only supporting I2C serial engines without DMA support. Add the I2C Master Hub serial engine compatible along the specific requirements in a new desc struct passed through the device match data. Signed-off-by: Neil Armstrong Reviewed-by: Konrad Dybcio Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 75dd0718c5a12..bfe75038bc149 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -1026,8 +1026,16 @@ static const struct dev_pm_ops geni_i2c_pm_ops = { NULL) }; +const struct geni_i2c_desc i2c_master_hub = { + .has_core_clk = true, + .icc_ddr = NULL, + .no_dma_support = true, + .tx_fifo_depth = 16, +}; + static const struct of_device_id geni_i2c_dt_match[] = { { .compatible = "qcom,geni-i2c" }, + { .compatible = "qcom,geni-i2c-master-hub", .data = &i2c_master_hub }, {} }; MODULE_DEVICE_TABLE(of, geni_i2c_dt_match); -- GitLab From eaade84a6302f139aede74fe5a568a70adb9baa2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 5 Dec 2022 12:31:44 +0800 Subject: [PATCH 1140/1423] crypto: api - Use linux/cache.h instead of asm/cache.h Directly including asm/cache.h leads to build failures on powerpc so replace it with linux/cache.h instead. Fixes: e634ac4a8aaa ("crypto: api - Add crypto_tfm_ctx_dma") Reported-by: Stephen Rothwell Signed-off-by: Herbert Xu --- include/crypto/algapi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 8722fd67f40ae..61b327206b557 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -7,8 +7,8 @@ #ifndef _CRYPTO_ALGAPI_H #define _CRYPTO_ALGAPI_H -#include #include +#include #include #include #include -- GitLab From 73e9841ba7e3e394c276d5f24ef4e639bd5f24e1 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 30 Nov 2022 13:55:51 +0800 Subject: [PATCH 1141/1423] i2c: gpio: Fix potential unused warning for 'i2c_gpio_dt_ids' Dropping a matching #ifdef check along with dropping of_match_ptr() is just a cleanup, while dropping of_match_ptr() that has no corresponding #ifdef fixes an actual warning. Suggested-by: Andy Shevchenko Acked-by: Arnd Bergmann Signed-off-by: Binbin Zhou Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-gpio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index b1985c1667e16..0e4385a9bcf71 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -482,19 +482,17 @@ static int i2c_gpio_remove(struct platform_device *pdev) return 0; } -#if defined(CONFIG_OF) static const struct of_device_id i2c_gpio_dt_ids[] = { { .compatible = "i2c-gpio", }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, i2c_gpio_dt_ids); -#endif static struct platform_driver i2c_gpio_driver = { .driver = { .name = "i2c-gpio", - .of_match_table = of_match_ptr(i2c_gpio_dt_ids), + .of_match_table = i2c_gpio_dt_ids, }, .probe = i2c_gpio_probe, .remove = i2c_gpio_remove, -- GitLab From 99c4ec2397bb88c3325676a308b93aa8ba362de8 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Tue, 17 Apr 2018 16:32:32 +0200 Subject: [PATCH 1142/1423] i2c: mux: pca9541: switch to using .probe_new Use the new probe style for i2c drivers. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-pca9541.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index ea83de78f52db..09d1d9e67e31f 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -283,8 +283,7 @@ static int pca9541_release_chan(struct i2c_mux_core *muxc, u32 chan) /* * I2C init/probing/exit functions */ -static int pca9541_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int pca9541_probe(struct i2c_client *client) { struct i2c_adapter *adap = client->adapter; struct i2c_mux_core *muxc; @@ -337,7 +336,7 @@ static struct i2c_driver pca9541_driver = { .name = "pca9541", .of_match_table = of_match_ptr(pca9541_of_match), }, - .probe = pca9541_probe, + .probe_new = pca9541_probe, .remove = pca9541_remove, .id_table = pca9541_id, }; -- GitLab From a00f6d3723f5617222ab8df228228c3c2c84e3ec Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Wed, 12 Oct 2022 18:36:47 +0200 Subject: [PATCH 1143/1423] drivers/i2c: use simple i2c probe All these drivers have an i2c probe function which doesn't use the "struct i2c_device_id *id" parameter, so they can trivially be converted to the "probe_new" style of probe with a single argument. This is part of an ongoing transition to single-argument i2c probe functions. Old-style probe functions involve a call to i2c_match_id: in drivers/i2c/i2c-core-base.c, /* * When there are no more users of probe(), * rename probe_new to probe. */ if (driver->probe_new) status = driver->probe_new(client); else if (driver->probe) status = driver->probe(client, i2c_match_id(driver->id_table, client)); else status = -EINVAL; Drivers which don't need the second parameter can be declared using probe_new instead, avoiding the call to i2c_match_id. Drivers which do can still be converted to probe_new-style, calling i2c_match_id themselves (as is done currently for of_match_id). This change was done using the following Coccinelle script, and fixed up for whitespace changes: @ rule1 @ identifier fn; identifier client, id; @@ - static int fn(struct i2c_client *client, const struct i2c_device_id *id) + static int fn(struct i2c_client *client) { ...when != id } @ rule2 depends on rule1 @ identifier rule1.fn; identifier driver; @@ struct i2c_driver driver = { - .probe + .probe_new = ( fn | - &fn + fn ) , }; Signed-off-by: Stephen Kitt Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 5 ++--- drivers/i2c/i2c-smbus.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 9aa7b9d9a4856..82478eab71af0 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1017,15 +1017,14 @@ static const struct i2c_device_id dummy_id[] = { { }, }; -static int dummy_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int dummy_probe(struct i2c_client *client) { return 0; } static struct i2c_driver dummy_driver = { .driver.name = "dummy", - .probe = dummy_probe, + .probe_new = dummy_probe, .id_table = dummy_id, }; diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index c85710ed95483..cd19546d31fcb 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -112,8 +112,7 @@ static void smbalert_work(struct work_struct *work) } /* Setup SMBALERT# infrastructure */ -static int smbalert_probe(struct i2c_client *ara, - const struct i2c_device_id *id) +static int smbalert_probe(struct i2c_client *ara) { struct i2c_smbus_alert_setup *setup = dev_get_platdata(&ara->dev); struct i2c_smbus_alert *alert; @@ -170,7 +169,7 @@ static struct i2c_driver smbalert_driver = { .driver = { .name = "smbus_alert", }, - .probe = smbalert_probe, + .probe_new = smbalert_probe, .remove = smbalert_remove, .id_table = smbalert_ids, }; -- GitLab From a5eacd2e3790ecf1cbc0be2e53a6f3d1ce3bb719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:36:22 +0100 Subject: [PATCH 1144/1423] i2c: mux: pca954x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Acked-by: Peter Rosin Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-pca954x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index a5f458b635df6..3639e6d7304cd 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -411,9 +411,9 @@ static int pca954x_init(struct i2c_client *client, struct pca954x *data) /* * I2C init/probing/exit functions */ -static int pca954x_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int pca954x_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct i2c_adapter *adap = client->adapter; struct device *dev = &client->dev; struct gpio_desc *gpio; @@ -554,7 +554,7 @@ static struct i2c_driver pca954x_driver = { .pm = &pca954x_pm, .of_match_table = pca954x_of_match, }, - .probe = pca954x_probe, + .probe_new = pca954x_probe, .remove = pca954x_remove, .id_table = pca954x_id, }; -- GitLab From 87ab726952267caabb9c1404e007e16e42f03b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 18 Nov 2022 23:36:19 +0100 Subject: [PATCH 1145/1423] i2c: slave-eeprom: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 4abc2d9198815..5f25f23c4ff8c 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -140,8 +140,9 @@ static int i2c_slave_init_eeprom_data(struct eeprom_data *eeprom, struct i2c_cli return 0; } -static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_device_id *id) +static int i2c_slave_eeprom_probe(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct eeprom_data *eeprom; int ret; unsigned int size = FIELD_GET(I2C_SLAVE_BYTELEN, id->driver_data) + 1; @@ -206,7 +207,7 @@ static struct i2c_driver i2c_slave_eeprom_driver = { .driver = { .name = "i2c-slave-eeprom", }, - .probe = i2c_slave_eeprom_probe, + .probe_new = i2c_slave_eeprom_probe, .remove = i2c_slave_eeprom_remove, .id_table = i2c_slave_eeprom_id, }; -- GitLab From d78a167332e1ca8113268ed922c1212fd71b73ad Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Mon, 14 Nov 2022 17:25:40 +0800 Subject: [PATCH 1146/1423] i2c: pxa-pci: fix missing pci_disable_device() on error in ce4100_i2c_probe Using pcim_enable_device() to avoid missing pci_disable_device(). Fixes: 7e94dd154e93 ("i2c-pxa2xx: Add PCI support for PXA I2C controller") Signed-off-by: Hui Tang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pxa-pci.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-pxa-pci.c b/drivers/i2c/busses/i2c-pxa-pci.c index f614cade432bb..30e38bc8b6db8 100644 --- a/drivers/i2c/busses/i2c-pxa-pci.c +++ b/drivers/i2c/busses/i2c-pxa-pci.c @@ -105,7 +105,7 @@ static int ce4100_i2c_probe(struct pci_dev *dev, int i; struct ce4100_devices *sds; - ret = pci_enable_device_mem(dev); + ret = pcim_enable_device(dev); if (ret) return ret; @@ -114,10 +114,8 @@ static int ce4100_i2c_probe(struct pci_dev *dev, return -EINVAL; } sds = kzalloc(sizeof(*sds), GFP_KERNEL); - if (!sds) { - ret = -ENOMEM; - goto err_mem; - } + if (!sds) + return -ENOMEM; for (i = 0; i < ARRAY_SIZE(sds->pdev); i++) { sds->pdev[i] = add_i2c_device(dev, i); @@ -133,8 +131,6 @@ static int ce4100_i2c_probe(struct pci_dev *dev, err_dev_add: kfree(sds); -err_mem: - pci_disable_device(dev); return ret; } -- GitLab From 58ff6569bc6ec369482eb2d132868870380be64c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 5 Dec 2022 12:05:51 +0000 Subject: [PATCH 1147/1423] KVM: arm64: PMU: Fix period computation for 64bit counters with 32bit overflow Fix the bogus masking when computing the period of a 64bit counter with 32bit overflow. It really should be treated like a 32bit counter for the purpose of the period. Reported-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/Y4jbosgHbUDI0WF4@google.com --- arch/arm64/kvm/pmu-emul.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index d8ea399430861..24908400e1906 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -461,14 +461,10 @@ static u64 compute_period(struct kvm_pmc *pmc, u64 counter) { u64 val; - if (kvm_pmc_is_64bit(pmc)) { - if (!kvm_pmc_has_64bit_overflow(pmc)) - val = -(counter & GENMASK(31, 0)); - else - val = (-counter) & GENMASK(63, 0); - } else { + if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc)) + val = (-counter) & GENMASK(63, 0); + else val = (-counter) & GENMASK(31, 0); - } return val; } -- GitLab From f794eec86c7cd9a340a66109e6f32f8659ff30fa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 8 Sep 2022 15:44:58 -0300 Subject: [PATCH 1148/1423] vfio: Simplify vfio_create_group() The vfio.group_lock is now only used to serialize vfio_group creation and destruction, we don't need a micro-optimization of searching, unlocking, then allocating and searching again. Just hold the lock the whole time. Grabbed from: https://lore.kernel.org/kvm/20220922152338.2a2238fe.alex.williamson@redhat.com/ Link: https://lore.kernel.org/r/20221201145535.589687-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Jason Gunthorpe Signed-off-by: Alex Williamson Signed-off-by: Yi Liu --- drivers/vfio/vfio_main.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 6d51b700764e5..f913d862a3868 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -143,10 +143,12 @@ EXPORT_SYMBOL_GPL(vfio_device_set_open_count); * Group objects - create, release, get, put, search */ static struct vfio_group * -__vfio_group_get_from_iommu(struct iommu_group *iommu_group) +vfio_group_get_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; + lockdep_assert_held(&vfio.group_lock); + /* * group->iommu_group from the vfio.group_list cannot be NULL * under the vfio.group_lock. @@ -160,17 +162,6 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) return NULL; } -static struct vfio_group * -vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - mutex_lock(&vfio.group_lock); - group = __vfio_group_get_from_iommu(iommu_group); - mutex_unlock(&vfio.group_lock); - return group; -} - static void vfio_group_release(struct device *dev) { struct vfio_group *group = container_of(dev, struct vfio_group, dev); @@ -225,6 +216,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, struct vfio_group *ret; int err; + lockdep_assert_held(&vfio.group_lock); + group = vfio_group_alloc(iommu_group, type); if (IS_ERR(group)) return group; @@ -237,26 +230,16 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, goto err_put; } - mutex_lock(&vfio.group_lock); - - /* Did we race creating this group? */ - ret = __vfio_group_get_from_iommu(iommu_group); - if (ret) - goto err_unlock; - err = cdev_device_add(&group->cdev, &group->dev); if (err) { ret = ERR_PTR(err); - goto err_unlock; + goto err_put; } list_add(&group->vfio_next, &vfio.group_list); - mutex_unlock(&vfio.group_lock); return group; -err_unlock: - mutex_unlock(&vfio.group_lock); err_put: put_device(&group->dev); return ret; @@ -467,7 +450,9 @@ static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, if (ret) goto out_put_group; + mutex_lock(&vfio.group_lock); group = vfio_create_group(iommu_group, type); + mutex_unlock(&vfio.group_lock); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_remove_device; @@ -516,9 +501,11 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) return ERR_PTR(-EINVAL); } + mutex_lock(&vfio.group_lock); group = vfio_group_get_from_iommu(iommu_group); if (!group) group = vfio_create_group(iommu_group, VFIO_IOMMU); + mutex_unlock(&vfio.group_lock); /* The vfio_group holds a reference to the iommu_group */ iommu_group_put(iommu_group); -- GitLab From dcb93d0364a238315dd71f834b199d4b95ae09eb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 8 Sep 2022 15:44:59 -0300 Subject: [PATCH 1149/1423] vfio: Move the sanity check of the group to vfio_create_group() This avoids opening group specific code in __vfio_register_dev() for the sanity check if an (existing) group is not corrupted by having two copies of the same struct device in it. It also simplifies the error unwind for this sanity check since the failure can be detected in the group allocation. This also prepares for moving the group specific code into separate group.c. Grabbed from: https://lore.kernel.org/kvm/20220922152338.2a2238fe.alex.williamson@redhat.com/ Link: https://lore.kernel.org/r/20221201145535.589687-3-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Jason Gunthorpe Signed-off-by: Alex Williamson Signed-off-by: Yi Liu --- drivers/vfio/vfio_main.c | 62 ++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f913d862a3868..87d9a1670a2fb 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -143,7 +143,7 @@ EXPORT_SYMBOL_GPL(vfio_device_set_open_count); * Group objects - create, release, get, put, search */ static struct vfio_group * -vfio_group_get_from_iommu(struct iommu_group *iommu_group) +vfio_group_find_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; @@ -154,10 +154,8 @@ vfio_group_get_from_iommu(struct iommu_group *iommu_group) * under the vfio.group_lock. */ list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) { - refcount_inc(&group->drivers); + if (group->iommu_group == iommu_group) return group; - } } return NULL; } @@ -307,23 +305,6 @@ static bool vfio_device_try_get_registration(struct vfio_device *device) return refcount_inc_not_zero(&device->refcount); } -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, - struct device *dev) -{ - struct vfio_device *device; - - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && - vfio_device_try_get_registration(device)) { - mutex_unlock(&group->device_lock); - return device; - } - } - mutex_unlock(&group->device_lock); - return NULL; -} - /* * VFIO driver API */ @@ -467,6 +448,21 @@ out_put_group: return ERR_PTR(ret); } +static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (device->dev == dev) { + mutex_unlock(&group->device_lock); + return true; + } + } + mutex_unlock(&group->device_lock); + return false; +} + static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) { struct iommu_group *iommu_group; @@ -502,9 +498,15 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) } mutex_lock(&vfio.group_lock); - group = vfio_group_get_from_iommu(iommu_group); - if (!group) + group = vfio_group_find_from_iommu(iommu_group); + if (group) { + if (WARN_ON(vfio_group_has_device(group, dev))) + group = ERR_PTR(-EINVAL); + else + refcount_inc(&group->drivers); + } else { group = vfio_create_group(iommu_group, VFIO_IOMMU); + } mutex_unlock(&vfio.group_lock); /* The vfio_group holds a reference to the iommu_group */ @@ -515,7 +517,6 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) static int __vfio_register_dev(struct vfio_device *device, struct vfio_group *group) { - struct vfio_device *existing_device; int ret; /* @@ -537,19 +538,6 @@ static int __vfio_register_dev(struct vfio_device *device, if (!device->dev_set) vfio_assign_device_set(device, device); - existing_device = vfio_group_get_device(group, device->dev); - if (existing_device) { - /* - * group->iommu_group is non-NULL because we hold the drivers - * refcount. - */ - dev_WARN(device->dev, "Device already exists on group %d\n", - iommu_group_id(group->iommu_group)); - vfio_device_put_registration(existing_device); - ret = -EBUSY; - goto err_out; - } - /* Our reference on group is moved to the device */ device->group = group; -- GitLab From 32e0922821f2115eb8940b6a6b942ba61eff15c2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 23 Sep 2022 02:19:34 -0700 Subject: [PATCH 1150/1423] vfio: Create wrappers for group register/unregister This avoids decoding group fields in the common functions used by vfio_device registration, and prepares for further moving the vfio group specific code into separate file. Link: https://lore.kernel.org/r/20221201145535.589687-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 87d9a1670a2fb..a5122fa4bf4d1 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -514,6 +514,20 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) return group; } +static void vfio_device_group_register(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_add(&device->group_next, &device->group->device_list); + mutex_unlock(&device->group->device_lock); +} + +static void vfio_device_group_unregister(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_del(&device->group_next); + mutex_unlock(&device->group->device_lock); +} + static int __vfio_register_dev(struct vfio_device *device, struct vfio_group *group) { @@ -552,9 +566,7 @@ static int __vfio_register_dev(struct vfio_device *device, /* Refcounting can't start until the driver calls register */ refcount_set(&device->refcount, 1); - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - mutex_unlock(&group->device_lock); + vfio_device_group_register(device); return 0; err_out: @@ -614,7 +626,6 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, * removed. Open file descriptors for the device... */ void vfio_unregister_group_dev(struct vfio_device *device) { - struct vfio_group *group = device->group; unsigned int i = 0; bool interrupted = false; long rc; @@ -642,9 +653,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) } } - mutex_lock(&group->device_lock); - list_del(&device->group_next); - mutex_unlock(&group->device_lock); + vfio_device_group_unregister(device); /* Balances device_add in register path */ device_del(&device->device); -- GitLab From 49ea02d390a34a538a3f54b9ce5665e474690bc7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 24 Nov 2022 21:22:27 -0800 Subject: [PATCH 1151/1423] vfio: Set device->group in helper function This avoids referencing device->group in __vfio_register_dev(). Link: https://lore.kernel.org/r/20221201145535.589687-5-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 41 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a5122fa4bf4d1..7e42ee0ee1bce 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -528,18 +528,29 @@ static void vfio_device_group_unregister(struct vfio_device *device) mutex_unlock(&device->group->device_lock); } -static int __vfio_register_dev(struct vfio_device *device, - struct vfio_group *group) +static int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type) { - int ret; + struct vfio_group *group; + + if (type == VFIO_IOMMU) + group = vfio_group_find_or_alloc(device->dev); + else + group = vfio_noiommu_group_alloc(device->dev, type); - /* - * In all cases group is the output of one of the group allocation - * functions and we have group->drivers incremented for us. - */ if (IS_ERR(group)) return PTR_ERR(group); + /* Our reference on group is moved to the device */ + device->group = group; + return 0; +} + +static int __vfio_register_dev(struct vfio_device *device, + enum vfio_group_type type) +{ + int ret; + if (WARN_ON(device->ops->bind_iommufd && (!device->ops->unbind_iommufd || !device->ops->attach_ioas))) @@ -552,12 +563,13 @@ static int __vfio_register_dev(struct vfio_device *device, if (!device->dev_set) vfio_assign_device_set(device, device); - /* Our reference on group is moved to the device */ - device->group = group; - ret = dev_set_name(&device->device, "vfio%d", device->index); if (ret) - goto err_out; + return ret; + + ret = vfio_device_set_group(device, type); + if (ret) + return ret; ret = device_add(&device->device); if (ret) @@ -576,8 +588,7 @@ err_out: int vfio_register_group_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_group_find_or_alloc(device->dev)); + return __vfio_register_dev(device, VFIO_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_group_dev); @@ -587,8 +598,7 @@ EXPORT_SYMBOL_GPL(vfio_register_group_dev); */ int vfio_register_emulated_iommu_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU)); + return __vfio_register_dev(device, VFIO_EMULATED_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); @@ -658,6 +668,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) /* Balances device_add in register path */ device_del(&device->device); + /* Balances vfio_device_set_group in register path */ vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -- GitLab From 07b465863325faceb865871ff5f22c1ebba6df54 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 10 Nov 2022 07:26:09 -0800 Subject: [PATCH 1152/1423] vfio: Swap order of vfio_device_container_register() and open_device() This makes the DMA unmap callback registration to container be consistent across the vfio iommufd compat mode and the legacy container mode. In the vfio iommufd compat mode, this registration is done in the vfio_iommufd_bind() when creating access which has an unmap callback. This is prior to calling the open_device() op. The existing mdev drivers have been converted to be OK with this order. So it is ok to swap the order of vfio_device_container_register() and open_device() for legacy mode. This also prepares for further moving group specific code into separate source file. Link: https://lore.kernel.org/r/20221201145535.589687-6-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7e42ee0ee1bce..5dddf962f6509 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -807,6 +807,7 @@ static int vfio_device_first_open(struct vfio_device *device) ret = vfio_group_use_container(device->group); if (ret) goto err_module_put; + vfio_device_container_register(device); } else if (device->group->iommufd) { ret = vfio_iommufd_bind(device, device->group->iommufd); if (ret) @@ -819,17 +820,17 @@ static int vfio_device_first_open(struct vfio_device *device) if (ret) goto err_container; } - if (device->group->container) - vfio_device_container_register(device); mutex_unlock(&device->group->group_lock); return 0; err_container: device->kvm = NULL; - if (device->group->container) + if (device->group->container) { + vfio_device_container_unregister(device); vfio_group_unuse_container(device->group); - else if (device->group->iommufd) + } else if (device->group->iommufd) { vfio_iommufd_unbind(device); + } err_module_put: mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); @@ -841,15 +842,15 @@ static void vfio_device_last_close(struct vfio_device *device) lockdep_assert_held(&device->dev_set->lock); mutex_lock(&device->group->group_lock); - if (device->group->container) - vfio_device_container_unregister(device); if (device->ops->close_device) device->ops->close_device(device); device->kvm = NULL; - if (device->group->container) + if (device->group->container) { + vfio_device_container_unregister(device); vfio_group_unuse_container(device->group); - else if (device->group->iommufd) + } else if (device->group->iommufd) { vfio_iommufd_unbind(device); + } mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); } -- GitLab From 5cfff0774353ee35601e3d3fe2f0bd95c33aa5db Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 30 Sep 2022 03:22:55 -0700 Subject: [PATCH 1153/1423] vfio: Make vfio_device_open() truly device specific Then move group related logic into vfio_device_open_file(). Accordingly introduce a vfio_device_close() to pair up. Link: https://lore.kernel.org/r/20221201145535.589687-7-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5dddf962f6509..37413ac254c0a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -855,20 +855,41 @@ static void vfio_device_last_close(struct vfio_device *device) module_put(device->dev->driver->owner); } -static struct file *vfio_device_open(struct vfio_device *device) +static int vfio_device_open(struct vfio_device *device) { - struct file *filep; - int ret; + int ret = 0; mutex_lock(&device->dev_set->lock); device->open_count++; if (device->open_count == 1) { ret = vfio_device_first_open(device); if (ret) - goto err_unlock; + device->open_count--; } mutex_unlock(&device->dev_set->lock); + return ret; +} + +static void vfio_device_close(struct vfio_device *device) +{ + mutex_lock(&device->dev_set->lock); + vfio_assert_device_open(device); + if (device->open_count == 1) + vfio_device_last_close(device); + device->open_count--; + mutex_unlock(&device->dev_set->lock); +} + +static struct file *vfio_device_open_file(struct vfio_device *device) +{ + struct file *filep; + int ret; + + ret = vfio_device_open(device); + if (ret) + goto err_out; + /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls @@ -897,12 +918,8 @@ static struct file *vfio_device_open(struct vfio_device *device) return filep; err_close_device: - mutex_lock(&device->dev_set->lock); - if (device->open_count == 1) - vfio_device_last_close(device); -err_unlock: - device->open_count--; - mutex_unlock(&device->dev_set->lock); + vfio_device_close(device); +err_out: return ERR_PTR(ret); } @@ -930,7 +947,7 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, goto err_put_device; } - filep = vfio_device_open(device); + filep = vfio_device_open_file(device); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_put_fdno; @@ -1113,12 +1130,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - mutex_lock(&device->dev_set->lock); - vfio_assert_device_open(device); - if (device->open_count == 1) - vfio_device_last_close(device); - device->open_count--; - mutex_unlock(&device->dev_set->lock); + vfio_device_close(device); vfio_device_put_registration(device); -- GitLab From 5c8d3d93f6a7c9371212690b0195160e5f88bdff Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 2 Nov 2022 04:42:25 -0700 Subject: [PATCH 1154/1423] vfio: Refactor vfio_device open and close This refactor makes the vfio_device_open() to accept device, iommufd_ctx pointer and kvm pointer. These parameters are generic items in today's group path and future device cdev path. Caller of vfio_device_open() should take care the necessary protections. e.g. the current group path need to hold the group_lock to ensure the iommufd_ctx and kvm pointer are valid. This refactor also wraps the group spefcific codes in the device open and close paths to be paired helpers like: - vfio_device_group_open/close(): call vfio_device_open/close() - vfio_device_group_use/unuse_iommu(): this pair is container specific. iommufd vs. container is selected in vfio_device_first_open(). Such helpers are supposed to be moved to group.c. While iommufd related codes will be kept in the generic helpers since future device cdev path also need to handle iommufd. Link: https://lore.kernel.org/r/20221201145535.589687-8-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 133 +++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 46 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 37413ac254c0a..a4583f4827e59 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -783,7 +783,38 @@ static bool vfio_assert_device_open(struct vfio_device *device) return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_device_first_open(struct vfio_device *device) +static int vfio_device_group_use_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + int ret = 0; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return -EINVAL; + + ret = vfio_group_use_container(group); + if (ret) + return ret; + vfio_device_container_register(device); + return 0; +} + +static void vfio_device_group_unuse_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return; + + vfio_device_container_unregister(device); + vfio_group_unuse_container(group); +} + +static int vfio_device_first_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { int ret; @@ -792,77 +823,56 @@ static int vfio_device_first_open(struct vfio_device *device) if (!try_module_get(device->dev->driver->owner)) return -ENODEV; - /* - * Here we pass the KVM pointer with the group under the lock. If the - * device driver will use it, it must obtain a reference and release it - * during close_device. - */ - mutex_lock(&device->group->group_lock); - if (!vfio_group_has_iommu(device->group)) { - ret = -EINVAL; + if (iommufd) + ret = vfio_iommufd_bind(device, iommufd); + else + ret = vfio_device_group_use_iommu(device); + if (ret) goto err_module_put; - } - if (device->group->container) { - ret = vfio_group_use_container(device->group); - if (ret) - goto err_module_put; - vfio_device_container_register(device); - } else if (device->group->iommufd) { - ret = vfio_iommufd_bind(device, device->group->iommufd); - if (ret) - goto err_module_put; - } - - device->kvm = device->group->kvm; + device->kvm = kvm; if (device->ops->open_device) { ret = device->ops->open_device(device); if (ret) - goto err_container; + goto err_unuse_iommu; } - mutex_unlock(&device->group->group_lock); return 0; -err_container: +err_unuse_iommu: device->kvm = NULL; - if (device->group->container) { - vfio_device_container_unregister(device); - vfio_group_unuse_container(device->group); - } else if (device->group->iommufd) { + if (iommufd) vfio_iommufd_unbind(device); - } + else + vfio_device_group_unuse_iommu(device); err_module_put: - mutex_unlock(&device->group->group_lock); module_put(device->dev->driver->owner); return ret; } -static void vfio_device_last_close(struct vfio_device *device) +static void vfio_device_last_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { lockdep_assert_held(&device->dev_set->lock); - mutex_lock(&device->group->group_lock); if (device->ops->close_device) device->ops->close_device(device); device->kvm = NULL; - if (device->group->container) { - vfio_device_container_unregister(device); - vfio_group_unuse_container(device->group); - } else if (device->group->iommufd) { + if (iommufd) vfio_iommufd_unbind(device); - } - mutex_unlock(&device->group->group_lock); + else + vfio_device_group_unuse_iommu(device); module_put(device->dev->driver->owner); } -static int vfio_device_open(struct vfio_device *device) +static int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { int ret = 0; mutex_lock(&device->dev_set->lock); device->open_count++; if (device->open_count == 1) { - ret = vfio_device_first_open(device); + ret = vfio_device_first_open(device, iommufd, kvm); if (ret) device->open_count--; } @@ -871,22 +881,53 @@ static int vfio_device_open(struct vfio_device *device) return ret; } -static void vfio_device_close(struct vfio_device *device) +static void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); if (device->open_count == 1) - vfio_device_last_close(device); + vfio_device_last_close(device, iommufd); device->open_count--; mutex_unlock(&device->dev_set->lock); } +static int vfio_device_group_open(struct vfio_device *device) +{ + int ret; + + mutex_lock(&device->group->group_lock); + if (!vfio_group_has_iommu(device->group)) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Here we pass the KVM pointer with the group under the lock. If the + * device driver will use it, it must obtain a reference and release it + * during close_device. + */ + ret = vfio_device_open(device, device->group->iommufd, + device->group->kvm); + +out_unlock: + mutex_unlock(&device->group->group_lock); + return ret; +} + +static void vfio_device_group_close(struct vfio_device *device) +{ + mutex_lock(&device->group->group_lock); + vfio_device_close(device, device->group->iommufd); + mutex_unlock(&device->group->group_lock); +} + static struct file *vfio_device_open_file(struct vfio_device *device) { struct file *filep; int ret; - ret = vfio_device_open(device); + ret = vfio_device_group_open(device); if (ret) goto err_out; @@ -918,7 +959,7 @@ static struct file *vfio_device_open_file(struct vfio_device *device) return filep; err_close_device: - vfio_device_close(device); + vfio_device_group_close(device); err_out: return ERR_PTR(ret); } @@ -1130,7 +1171,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - vfio_device_close(device); + vfio_device_group_close(device); vfio_device_put_registration(device); -- GitLab From 1334e47ee798ac4715330a6ade0afc929cd54aff Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 23 Sep 2022 07:08:36 -0700 Subject: [PATCH 1155/1423] vfio: Wrap vfio group module init/clean code into helpers This wraps the init/clean code of vfio group global variable to be helpers, and prepares for further moving vfio group specific code into separate file. As container is used by group, so vfio_container_init/cleanup() is moved into vfio_group_init/cleanup(). Link: https://lore.kernel.org/r/20221201145535.589687-9-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/vfio_main.c | 56 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a4583f4827e59..e053998baffd8 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -2066,12 +2066,11 @@ static char *vfio_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } -static int __init vfio_init(void) +static int __init vfio_group_init(void) { int ret; ida_init(&vfio.group_ida); - ida_init(&vfio.device_ida); mutex_init(&vfio.group_lock); INIT_LIST_HEAD(&vfio.group_list); @@ -2088,24 +2087,12 @@ static int __init vfio_init(void) vfio.class->devnode = vfio_devnode; - /* /sys/class/vfio-dev/vfioX */ - vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); - if (IS_ERR(vfio.device_class)) { - ret = PTR_ERR(vfio.device_class); - goto err_dev_class; - } - ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); if (ret) goto err_alloc_chrdev; - - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; err_alloc_chrdev: - class_destroy(vfio.device_class); - vfio.device_class = NULL; -err_dev_class: class_destroy(vfio.class); vfio.class = NULL; err_group_class: @@ -2113,18 +2100,47 @@ err_group_class: return ret; } -static void __exit vfio_cleanup(void) +static void vfio_group_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); - - ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); - class_destroy(vfio.device_class); - vfio.device_class = NULL; class_destroy(vfio.class); - vfio_container_cleanup(); vfio.class = NULL; + vfio_container_cleanup(); +} + +static int __init vfio_init(void) +{ + int ret; + + ida_init(&vfio.device_ida); + + ret = vfio_group_init(); + if (ret) + return ret; + + /* /sys/class/vfio-dev/vfioX */ + vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); + if (IS_ERR(vfio.device_class)) { + ret = PTR_ERR(vfio.device_class); + goto err_dev_class; + } + + pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); + return 0; + +err_dev_class: + vfio_group_cleanup(); + return ret; +} + +static void __exit vfio_cleanup(void) +{ + ida_destroy(&vfio.device_ida); + class_destroy(vfio.device_class); + vfio.device_class = NULL; + vfio_group_cleanup(); xa_destroy(&vfio_device_set_xa); } -- GitLab From 8da7a0e79f9b15330ae68d8532425399f4c27045 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 10 Nov 2022 18:57:01 -0800 Subject: [PATCH 1156/1423] vfio: Refactor dma APIs for emulated devices To use group helpers instead of opening group related code in the API. This prepares moving group specific code out of vfio_main.c. Link: https://lore.kernel.org/r/20221201145535.589687-10-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Lixiao Yang Tested-by: Yu He Signed-off-by: Yi Liu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/vfio/container.c | 20 +++++++++++++------- drivers/vfio/vfio.h | 32 ++++++++++++++++---------------- drivers/vfio/vfio_main.c | 25 ++++++++++++++----------- 3 files changed, 43 insertions(+), 34 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 6b362d97d6822..b7a9560ab25e4 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -540,10 +540,12 @@ void vfio_group_unuse_container(struct vfio_group *group) fput(group->opened_file); } -int vfio_container_pin_pages(struct vfio_container *container, - struct iommu_group *iommu_group, dma_addr_t iova, - int npage, int prot, struct page **pages) +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) { + struct vfio_container *container = device->group->container; + struct iommu_group *iommu_group = device->group->iommu_group; struct vfio_iommu_driver *driver = container->iommu_driver; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) @@ -555,9 +557,11 @@ int vfio_container_pin_pages(struct vfio_container *container, npage, prot, pages); } -void vfio_container_unpin_pages(struct vfio_container *container, - dma_addr_t iova, int npage) +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) { + struct vfio_container *container = device->group->container; + if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) return; @@ -565,9 +569,11 @@ void vfio_container_unpin_pages(struct vfio_container *container, npage); } -int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, - void *data, size_t len, bool write) +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) { + struct vfio_container *container = device->group->container; struct vfio_iommu_driver *driver = container->iommu_driver; if (unlikely(!driver || !driver->ops->dma_rw)) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index ce5fe3fc493b4..a112e8f2b291a 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -122,13 +122,14 @@ int vfio_container_attach_group(struct vfio_container *container, void vfio_group_detach_container(struct vfio_group *group); void vfio_device_container_register(struct vfio_device *device); void vfio_device_container_unregister(struct vfio_device *device); -int vfio_container_pin_pages(struct vfio_container *container, - struct iommu_group *iommu_group, dma_addr_t iova, - int npage, int prot, struct page **pages); -void vfio_container_unpin_pages(struct vfio_container *container, - dma_addr_t iova, int npage); -int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, - void *data, size_t len, bool write); +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages); +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage); +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write); int __init vfio_container_init(void); void vfio_container_cleanup(void); @@ -166,22 +167,21 @@ static inline void vfio_device_container_unregister(struct vfio_device *device) { } -static inline int vfio_container_pin_pages(struct vfio_container *container, - struct iommu_group *iommu_group, - dma_addr_t iova, int npage, int prot, - struct page **pages) +static inline int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) { return -EOPNOTSUPP; } -static inline void vfio_container_unpin_pages(struct vfio_container *container, - dma_addr_t iova, int npage) +static inline void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) { } -static inline int vfio_container_dma_rw(struct vfio_container *container, - dma_addr_t iova, void *data, size_t len, - bool write) +static inline int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) { return -EOPNOTSUPP; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e053998baffd8..b7f94ace7b10b 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1938,6 +1938,11 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, } EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); +static bool vfio_device_has_container(struct vfio_device *device) +{ + return device->group->container; +} + /* * Pin contiguous user pages and return their associated host pages for local * domain only. @@ -1950,7 +1955,7 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); * Return error or number of pages pinned. * * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev() due to vfio_container_pin_pages(). + * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages(). */ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, int npage, int prot, struct page **pages) @@ -1958,10 +1963,9 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, /* group->container cannot change while a vfio device is open */ if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) return -EINVAL; - if (device->group->container) - return vfio_container_pin_pages(device->group->container, - device->group->iommu_group, - iova, npage, prot, pages); + if (vfio_device_has_container(device)) + return vfio_device_container_pin_pages(device, iova, + npage, prot, pages); if (device->iommufd_access) { int ret; @@ -1997,9 +2001,8 @@ void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) if (WARN_ON(!vfio_assert_device_open(device))) return; - if (device->group->container) { - vfio_container_unpin_pages(device->group->container, iova, - npage); + if (vfio_device_has_container(device)) { + vfio_device_container_unpin_pages(device, iova, npage); return; } if (device->iommufd_access) { @@ -2036,9 +2039,9 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, if (!data || len <= 0 || !vfio_assert_device_open(device)) return -EINVAL; - if (device->group->container) - return vfio_container_dma_rw(device->group->container, iova, - data, len, write); + if (vfio_device_has_container(device)) + return vfio_device_container_dma_rw(device, iova, + data, len, write); if (device->iommufd_access) { unsigned int flags = 0; -- GitLab From 9eefba8002c27d65ab52a533fd0611b099b73591 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 25 Nov 2022 03:26:42 -0800 Subject: [PATCH 1157/1423] vfio: Move vfio group specific code into group.c This prepares for compiling out vfio group after vfio device cdev is added. No vfio_group decode code should be in vfio_main.c, and neither device->group reference should be in vfio_main.c. No functional change is intended. Link: https://lore.kernel.org/r/20221201145535.589687-11-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Yu He Tested-by: Lixiao Yang Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/vfio/Makefile | 1 + drivers/vfio/group.c | 877 +++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 22 + drivers/vfio/vfio_main.c | 877 +-------------------------------------- 4 files changed, 907 insertions(+), 870 deletions(-) create mode 100644 drivers/vfio/group.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b953517dc70f9..3783db7e8082c 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -4,6 +4,7 @@ vfio_virqfd-y := virqfd.o obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ + group.o \ iova_bitmap.o vfio-$(CONFIG_IOMMUFD) += iommufd.o vfio-$(CONFIG_VFIO_CONTAINER) += container.o diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c new file mode 100644 index 0000000000000..c5d8bf10495ec --- /dev/null +++ b/drivers/vfio/group.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO core + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include +#include +#include +#include "vfio.h" + +static struct vfio { + struct class *class; + struct list_head group_list; + struct mutex group_lock; /* locks group_list */ + struct ida group_ida; + dev_t group_devt; +} vfio; + +static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, + char *buf) +{ + struct vfio_device *it, *device = ERR_PTR(-ENODEV); + + mutex_lock(&group->device_lock); + list_for_each_entry(it, &group->device_list, group_next) { + int ret; + + if (it->ops->match) { + ret = it->ops->match(it, buf); + if (ret < 0) { + device = ERR_PTR(ret); + break; + } + } else { + ret = !strcmp(dev_name(it->dev), buf); + } + + if (ret && vfio_device_try_get_registration(it)) { + device = it; + break; + } + } + mutex_unlock(&group->device_lock); + + return device; +} + +/* + * VFIO Group fd, /dev/vfio/$GROUP + */ +static bool vfio_group_has_iommu(struct vfio_group *group) +{ + lockdep_assert_held(&group->group_lock); + /* + * There can only be users if there is a container, and if there is a + * container there must be users. + */ + WARN_ON(!group->container != !group->container_users); + + return group->container || group->iommufd; +} + +/* + * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or + * if there was no container to unset. Since the ioctl is called on + * the group, we know that still exists, therefore the only valid + * transition here is 1->0. + */ +static int vfio_group_ioctl_unset_container(struct vfio_group *group) +{ + int ret = 0; + + mutex_lock(&group->group_lock); + if (!vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (group->container) { + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } + vfio_group_detach_container(group); + } + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) +{ + struct vfio_container *container; + struct iommufd_ctx *iommufd; + struct fd f; + int ret; + int fd; + + if (get_user(fd, arg)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + mutex_lock(&group->group_lock); + if (vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + + container = vfio_container_from_file(f.file); + if (container) { + ret = vfio_container_attach_group(container, group); + goto out_unlock; + } + + iommufd = iommufd_ctx_from_file(f.file); + if (!IS_ERR(iommufd)) { + u32 ioas_id; + + ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); + if (ret) { + iommufd_ctx_put(group->iommufd); + goto out_unlock; + } + + group->iommufd = iommufd; + goto out_unlock; + } + + /* The FD passed is not recognized. */ + ret = -EBADFD; + +out_unlock: + mutex_unlock(&group->group_lock); + fdput(f); + return ret; +} + +static int vfio_device_group_open(struct vfio_device *device) +{ + int ret; + + mutex_lock(&device->group->group_lock); + if (!vfio_group_has_iommu(device->group)) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Here we pass the KVM pointer with the group under the lock. If the + * device driver will use it, it must obtain a reference and release it + * during close_device. + */ + ret = vfio_device_open(device, device->group->iommufd, + device->group->kvm); + +out_unlock: + mutex_unlock(&device->group->group_lock); + return ret; +} + +void vfio_device_group_close(struct vfio_device *device) +{ + mutex_lock(&device->group->group_lock); + vfio_device_close(device, device->group->iommufd); + mutex_unlock(&device->group->group_lock); +} + +static struct file *vfio_device_open_file(struct vfio_device *device) +{ + struct file *filep; + int ret; + + ret = vfio_device_group_open(device); + if (ret) + goto err_out; + + /* + * We can't use anon_inode_getfd() because we need to modify + * the f_mode flags directly to allow more than just ioctls + */ + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_close_device; + } + + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. + */ + filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + + if (device->group->type == VFIO_NO_IOMMU) + dev_warn(device->dev, "vfio-noiommu device opened by user " + "(%s:%d)\n", current->comm, task_pid_nr(current)); + /* + * On success the ref of device is moved to the file and + * put in vfio_device_fops_release() + */ + return filep; + +err_close_device: + vfio_device_group_close(device); +err_out: + return ERR_PTR(ret); +} + +static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, + char __user *arg) +{ + struct vfio_device *device; + struct file *filep; + char *buf; + int fdno; + int ret; + + buf = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + device = vfio_device_get_from_name(group, buf); + kfree(buf); + if (IS_ERR(device)) + return PTR_ERR(device); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + ret = fdno; + goto err_put_device; + } + + filep = vfio_device_open_file(device); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_put_fdno; + } + + fd_install(fdno, filep); + return fdno; + +err_put_fdno: + put_unused_fd(fdno); +err_put_device: + vfio_device_put_registration(device); + return ret; +} + +static int vfio_group_ioctl_get_status(struct vfio_group *group, + struct vfio_group_status __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_group_status, flags); + struct vfio_group_status status; + + if (copy_from_user(&status, arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + + /* + * With the container FD the iommu_group_claim_dma_owner() is done + * during SET_CONTAINER but for IOMMFD this is done during + * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd + * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due + * to viability. + */ + if (vfio_group_has_iommu(group)) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | + VFIO_GROUP_FLAGS_VIABLE; + else if (!iommu_group_dma_owner_claimed(group->iommu_group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + mutex_unlock(&group->group_lock); + + if (copy_to_user(arg, &status, minsz)) + return -EFAULT; + return 0; +} + +static long vfio_group_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = filep->private_data; + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GROUP_GET_DEVICE_FD: + return vfio_group_ioctl_get_device_fd(group, uarg); + case VFIO_GROUP_GET_STATUS: + return vfio_group_ioctl_get_status(group, uarg); + case VFIO_GROUP_SET_CONTAINER: + return vfio_group_ioctl_set_container(group, uarg); + case VFIO_GROUP_UNSET_CONTAINER: + return vfio_group_ioctl_unset_container(group); + default: + return -ENOTTY; + } +} + +static int vfio_group_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = + container_of(inode->i_cdev, struct vfio_group, cdev); + int ret; + + mutex_lock(&group->group_lock); + + /* + * drivers can be zero if this races with vfio_device_remove_group(), it + * will be stable at 0 under the group rwsem + */ + if (refcount_read(&group->drivers) == 0) { + ret = -ENODEV; + goto out_unlock; + } + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { + ret = -EPERM; + goto out_unlock; + } + + /* + * Do we need multiple instances of the group open? Seems not. + */ + if (group->opened_file) { + ret = -EBUSY; + goto out_unlock; + } + group->opened_file = filep; + filep->private_data = group; + ret = 0; +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + filep->private_data = NULL; + + mutex_lock(&group->group_lock); + /* + * Device FDs hold a group file reference, therefore the group release + * is only called when there are no open devices. + */ + WARN_ON(group->notifier.head); + if (group->container) + vfio_group_detach_container(group); + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + group->opened_file = NULL; + mutex_unlock(&group->group_lock); + return 0; +} + +static const struct file_operations vfio_group_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vfio_group_fops_unl_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .open = vfio_group_fops_open, + .release = vfio_group_fops_release, +}; + +/* + * Group objects - create, release, get, put, search + */ +static struct vfio_group * +vfio_group_find_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + lockdep_assert_held(&vfio.group_lock); + + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the vfio.group_lock. + */ + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group->iommu_group == iommu_group) + return group; + } + return NULL; +} + +static void vfio_group_release(struct device *dev) +{ + struct vfio_group *group = container_of(dev, struct vfio_group, dev); + + mutex_destroy(&group->device_lock); + mutex_destroy(&group->group_lock); + WARN_ON(group->iommu_group); + ida_free(&vfio.group_ida, MINOR(group->dev.devt)); + kfree(group); +} + +static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + int minor; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(group); + return ERR_PTR(minor); + } + + device_initialize(&group->dev); + group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); + group->dev.class = vfio.class; + group->dev.release = vfio_group_release; + cdev_init(&group->cdev, &vfio_group_fops); + group->cdev.owner = THIS_MODULE; + + refcount_set(&group->drivers, 1); + mutex_init(&group->group_lock); + INIT_LIST_HEAD(&group->device_list); + mutex_init(&group->device_lock); + group->iommu_group = iommu_group; + /* put in vfio_group_release() */ + iommu_group_ref_get(iommu_group); + group->type = type; + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); + + return group; +} + +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + struct vfio_group *ret; + int err; + + lockdep_assert_held(&vfio.group_lock); + + group = vfio_group_alloc(iommu_group, type); + if (IS_ERR(group)) + return group; + + err = dev_set_name(&group->dev, "%s%d", + group->type == VFIO_NO_IOMMU ? "noiommu-" : "", + iommu_group_id(iommu_group)); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + err = cdev_device_add(&group->cdev, &group->dev); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + list_add(&group->vfio_next, &vfio.group_list); + + return group; + +err_put: + put_device(&group->dev); + return ret; +} + +static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, + enum vfio_group_type type) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + int ret; + + iommu_group = iommu_group_alloc(); + if (IS_ERR(iommu_group)) + return ERR_CAST(iommu_group); + + ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); + if (ret) + goto out_put_group; + ret = iommu_group_add_device(iommu_group, dev); + if (ret) + goto out_put_group; + + mutex_lock(&vfio.group_lock); + group = vfio_create_group(iommu_group, type); + mutex_unlock(&vfio.group_lock); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_remove_device; + } + iommu_group_put(iommu_group); + return group; + +out_remove_device: + iommu_group_remove_device(dev); +out_put_group: + iommu_group_put(iommu_group); + return ERR_PTR(ret); +} + +static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (device->dev == dev) { + mutex_unlock(&group->device_lock); + return true; + } + } + mutex_unlock(&group->device_lock); + return false; +} + +static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group && vfio_noiommu) { + /* + * With noiommu enabled, create an IOMMU group for devices that + * don't already have one, implying no IOMMU hardware/driver + * exists. Taint the kernel because we're about to give a DMA + * capable device to a user without IOMMU protection. + */ + group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); + if (!IS_ERR(group)) { + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); + } + return group; + } + + if (!iommu_group) + return ERR_PTR(-EINVAL); + + /* + * VFIO always sets IOMMU_CACHE because we offer no way for userspace to + * restore cache coherency. It has to be checked here because it is only + * valid for cases where we are using iommu groups. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { + iommu_group_put(iommu_group); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&vfio.group_lock); + group = vfio_group_find_from_iommu(iommu_group); + if (group) { + if (WARN_ON(vfio_group_has_device(group, dev))) + group = ERR_PTR(-EINVAL); + else + refcount_inc(&group->drivers); + } else { + group = vfio_create_group(iommu_group, VFIO_IOMMU); + } + mutex_unlock(&vfio.group_lock); + + /* The vfio_group holds a reference to the iommu_group */ + iommu_group_put(iommu_group); + return group; +} + +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type) +{ + struct vfio_group *group; + + if (type == VFIO_IOMMU) + group = vfio_group_find_or_alloc(device->dev); + else + group = vfio_noiommu_group_alloc(device->dev, type); + + if (IS_ERR(group)) + return PTR_ERR(group); + + /* Our reference on group is moved to the device */ + device->group = group; + return 0; +} + +void vfio_device_remove_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + struct iommu_group *iommu_group; + + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + + /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ + if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) + return; + list_del(&group->vfio_next); + + /* + * We could concurrently probe another driver in the group that might + * race vfio_device_remove_group() with vfio_get_group(), so we have to + * ensure that the sysfs is all cleaned up under lock otherwise the + * cdev_device_add() will fail due to the name aready existing. + */ + cdev_device_del(&group->cdev, &group->dev); + + mutex_lock(&group->group_lock); + /* + * These data structures all have paired operations that can only be + * undone when the caller holds a live reference on the device. Since + * all pairs must be undone these WARN_ON's indicate some caller did not + * properly hold the group reference. + */ + WARN_ON(!list_empty(&group->device_list)); + WARN_ON(group->notifier.head); + + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; + group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + + iommu_group_put(iommu_group); + put_device(&group->dev); +} + +void vfio_device_group_register(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_add(&device->group_next, &device->group->device_list); + mutex_unlock(&device->group->device_lock); +} + +void vfio_device_group_unregister(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_del(&device->group_next); + mutex_unlock(&device->group->device_lock); +} + +int vfio_device_group_use_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + int ret = 0; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return -EINVAL; + + ret = vfio_group_use_container(group); + if (ret) + return ret; + vfio_device_container_register(device); + return 0; +} + +void vfio_device_group_unuse_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return; + + vfio_device_container_unregister(device); + vfio_group_unuse_container(group); +} + +bool vfio_device_has_container(struct vfio_device *device) +{ + return device->group->container; +} + +/** + * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file + * @file: VFIO group file + * + * The returned iommu_group is valid as long as a ref is held on the file. This + * returns a reference on the group. This function is deprecated, only the SPAPR + * path in kvm should call it. + */ +struct iommu_group *vfio_file_iommu_group(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; + + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; + + if (!vfio_file_is_group(file)) + return NULL; + + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + iommu_group_ref_get(iommu_group); + } + mutex_unlock(&group->group_lock); + return iommu_group; +} +EXPORT_SYMBOL_GPL(vfio_file_iommu_group); + +/** + * vfio_file_is_group - True if the file is usable with VFIO aPIS + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + +/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file + * + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86. + */ +bool vfio_file_enforced_coherent(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct vfio_device *device; + bool ret = true; + + if (!vfio_file_is_group(file)) + return true; + + /* + * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then + * any domain later attached to it will also not support it. If the cap + * is set then the iommu_domain eventually attached to the device/group + * must use a domain with enforce_cache_coherency(). + */ + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (!device_iommu_capable(device->dev, + IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { + ret = false; + break; + } + } + mutex_unlock(&group->device_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); + +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file + * @kvm: KVM to link + * + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the group. + */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return; + + mutex_lock(&group->group_lock); + group->kvm = kvm; + mutex_unlock(&group->group_lock); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + +/** + * vfio_file_has_dev - True if the VFIO file is a handle for device + * @file: VFIO file to check + * @device: Device that must be part of the file + * + * Returns true if given file has permission to manipulate the given device. + */ +bool vfio_file_has_dev(struct file *file, struct vfio_device *device) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return false; + + return group == device->group; +} +EXPORT_SYMBOL_GPL(vfio_file_has_dev); + +static char *vfio_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); +} + +int __init vfio_group_init(void) +{ + int ret; + + ida_init(&vfio.group_ida); + mutex_init(&vfio.group_lock); + INIT_LIST_HEAD(&vfio.group_list); + + ret = vfio_container_init(); + if (ret) + return ret; + + /* /dev/vfio/$GROUP */ + vfio.class = class_create(THIS_MODULE, "vfio"); + if (IS_ERR(vfio.class)) { + ret = PTR_ERR(vfio.class); + goto err_group_class; + } + + vfio.class->devnode = vfio_devnode; + + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); + if (ret) + goto err_alloc_chrdev; + return 0; + +err_alloc_chrdev: + class_destroy(vfio.class); + vfio.class = NULL; +err_group_class: + vfio_container_cleanup(); + return ret; +} + +void vfio_group_cleanup(void) +{ + WARN_ON(!list_empty(&vfio.group_list)); + ida_destroy(&vfio.group_ida); + unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); + class_destroy(vfio.class); + vfio.class = NULL; + vfio_container_cleanup(); +} diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index a112e8f2b291a..2e05418fd18df 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -6,6 +6,7 @@ #ifndef __VFIO_VFIO_H__ #define __VFIO_VFIO_H__ +#include #include #include #include @@ -15,6 +16,15 @@ struct iommu_group; struct vfio_device; struct vfio_container; +void vfio_device_put_registration(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm); +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd); + +extern const struct file_operations vfio_device_fops; + enum vfio_group_type { /* * Physical device with IOMMU backing. @@ -66,6 +76,18 @@ struct vfio_group { struct iommufd_ctx *iommufd; }; +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type); +void vfio_device_remove_group(struct vfio_device *device); +void vfio_device_group_register(struct vfio_device *device); +void vfio_device_group_unregister(struct vfio_device *device); +int vfio_device_group_use_iommu(struct vfio_device *device); +void vfio_device_group_unuse_iommu(struct vfio_device *device); +void vfio_device_group_close(struct vfio_device *device); +bool vfio_device_has_container(struct vfio_device *device); +int __init vfio_group_init(void); +void vfio_group_cleanup(void); + #if IS_ENABLED(CONFIG_VFIO_CONTAINER) /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index b7f94ace7b10b..e21ff965141e6 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include #include #include @@ -43,17 +41,11 @@ #define DRIVER_DESC "VFIO - User Level meta-driver" static struct vfio { - struct class *class; - struct list_head group_list; - struct mutex group_lock; /* locks group_list */ - struct ida group_ida; - dev_t group_devt; struct class *device_class; struct ida device_ida; } vfio; static DEFINE_XARRAY(vfio_device_set_xa); -static const struct file_operations vfio_group_fops; int vfio_assign_device_set(struct vfio_device *device, void *set_id) { @@ -139,168 +131,17 @@ unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) } EXPORT_SYMBOL_GPL(vfio_device_set_open_count); -/* - * Group objects - create, release, get, put, search - */ -static struct vfio_group * -vfio_group_find_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - lockdep_assert_held(&vfio.group_lock); - - /* - * group->iommu_group from the vfio.group_list cannot be NULL - * under the vfio.group_lock. - */ - list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) - return group; - } - return NULL; -} - -static void vfio_group_release(struct device *dev) -{ - struct vfio_group *group = container_of(dev, struct vfio_group, dev); - - mutex_destroy(&group->device_lock); - mutex_destroy(&group->group_lock); - WARN_ON(group->iommu_group); - ida_free(&vfio.group_ida, MINOR(group->dev.devt)); - kfree(group); -} - -static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, - enum vfio_group_type type) -{ - struct vfio_group *group; - int minor; - - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) - return ERR_PTR(-ENOMEM); - - minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); - if (minor < 0) { - kfree(group); - return ERR_PTR(minor); - } - - device_initialize(&group->dev); - group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); - group->dev.class = vfio.class; - group->dev.release = vfio_group_release; - cdev_init(&group->cdev, &vfio_group_fops); - group->cdev.owner = THIS_MODULE; - - refcount_set(&group->drivers, 1); - mutex_init(&group->group_lock); - INIT_LIST_HEAD(&group->device_list); - mutex_init(&group->device_lock); - group->iommu_group = iommu_group; - /* put in vfio_group_release() */ - iommu_group_ref_get(iommu_group); - group->type = type; - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); - - return group; -} - -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - enum vfio_group_type type) -{ - struct vfio_group *group; - struct vfio_group *ret; - int err; - - lockdep_assert_held(&vfio.group_lock); - - group = vfio_group_alloc(iommu_group, type); - if (IS_ERR(group)) - return group; - - err = dev_set_name(&group->dev, "%s%d", - group->type == VFIO_NO_IOMMU ? "noiommu-" : "", - iommu_group_id(iommu_group)); - if (err) { - ret = ERR_PTR(err); - goto err_put; - } - - err = cdev_device_add(&group->cdev, &group->dev); - if (err) { - ret = ERR_PTR(err); - goto err_put; - } - - list_add(&group->vfio_next, &vfio.group_list); - - return group; - -err_put: - put_device(&group->dev); - return ret; -} - -static void vfio_device_remove_group(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - struct iommu_group *iommu_group; - - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - - /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ - if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) - return; - list_del(&group->vfio_next); - - /* - * We could concurrently probe another driver in the group that might - * race vfio_device_remove_group() with vfio_get_group(), so we have to - * ensure that the sysfs is all cleaned up under lock otherwise the - * cdev_device_add() will fail due to the name aready existing. - */ - cdev_device_del(&group->cdev, &group->dev); - - mutex_lock(&group->group_lock); - /* - * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the device. Since - * all pairs must be undone these WARN_ON's indicate some caller did not - * properly hold the group reference. - */ - WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->notifier.head); - - /* - * Revoke all users of group->iommu_group. At this point we know there - * are no devices active because we are unplugging the last one. Setting - * iommu_group to NULL blocks all new users. - */ - if (group->container) - vfio_group_detach_container(group); - iommu_group = group->iommu_group; - group->iommu_group = NULL; - mutex_unlock(&group->group_lock); - mutex_unlock(&vfio.group_lock); - - iommu_group_put(iommu_group); - put_device(&group->dev); -} - /* * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put_registration(struct vfio_device *device) +void vfio_device_put_registration(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -static bool vfio_device_try_get_registration(struct vfio_device *device) +bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } @@ -413,139 +254,6 @@ void vfio_free_device(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_free_device); -static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, - enum vfio_group_type type) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - int ret; - - iommu_group = iommu_group_alloc(); - if (IS_ERR(iommu_group)) - return ERR_CAST(iommu_group); - - ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); - if (ret) - goto out_put_group; - ret = iommu_group_add_device(iommu_group, dev); - if (ret) - goto out_put_group; - - mutex_lock(&vfio.group_lock); - group = vfio_create_group(iommu_group, type); - mutex_unlock(&vfio.group_lock); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto out_remove_device; - } - iommu_group_put(iommu_group); - return group; - -out_remove_device: - iommu_group_remove_device(dev); -out_put_group: - iommu_group_put(iommu_group); - return ERR_PTR(ret); -} - -static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) -{ - struct vfio_device *device; - - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev) { - mutex_unlock(&group->device_lock); - return true; - } - } - mutex_unlock(&group->device_lock); - return false; -} - -static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - - iommu_group = iommu_group_get(dev); - if (!iommu_group && vfio_noiommu) { - /* - * With noiommu enabled, create an IOMMU group for devices that - * don't already have one, implying no IOMMU hardware/driver - * exists. Taint the kernel because we're about to give a DMA - * capable device to a user without IOMMU protection. - */ - group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); - if (!IS_ERR(group)) { - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); - } - return group; - } - - if (!iommu_group) - return ERR_PTR(-EINVAL); - - /* - * VFIO always sets IOMMU_CACHE because we offer no way for userspace to - * restore cache coherency. It has to be checked here because it is only - * valid for cases where we are using iommu groups. - */ - if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { - iommu_group_put(iommu_group); - return ERR_PTR(-EINVAL); - } - - mutex_lock(&vfio.group_lock); - group = vfio_group_find_from_iommu(iommu_group); - if (group) { - if (WARN_ON(vfio_group_has_device(group, dev))) - group = ERR_PTR(-EINVAL); - else - refcount_inc(&group->drivers); - } else { - group = vfio_create_group(iommu_group, VFIO_IOMMU); - } - mutex_unlock(&vfio.group_lock); - - /* The vfio_group holds a reference to the iommu_group */ - iommu_group_put(iommu_group); - return group; -} - -static void vfio_device_group_register(struct vfio_device *device) -{ - mutex_lock(&device->group->device_lock); - list_add(&device->group_next, &device->group->device_list); - mutex_unlock(&device->group->device_lock); -} - -static void vfio_device_group_unregister(struct vfio_device *device) -{ - mutex_lock(&device->group->device_lock); - list_del(&device->group_next); - mutex_unlock(&device->group->device_lock); -} - -static int vfio_device_set_group(struct vfio_device *device, - enum vfio_group_type type) -{ - struct vfio_group *group; - - if (type == VFIO_IOMMU) - group = vfio_group_find_or_alloc(device->dev); - else - group = vfio_noiommu_group_alloc(device->dev, type); - - if (IS_ERR(group)) - return PTR_ERR(group); - - /* Our reference on group is moved to the device */ - device->group = group; - return 0; -} - static int __vfio_register_dev(struct vfio_device *device, enum vfio_group_type type) { @@ -602,35 +310,6 @@ int vfio_register_emulated_iommu_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); -static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, - char *buf) -{ - struct vfio_device *it, *device = ERR_PTR(-ENODEV); - - mutex_lock(&group->device_lock); - list_for_each_entry(it, &group->device_list, group_next) { - int ret; - - if (it->ops->match) { - ret = it->ops->match(it, buf); - if (ret < 0) { - device = ERR_PTR(ret); - break; - } - } else { - ret = !strcmp(dev_name(it->dev), buf); - } - - if (ret && vfio_device_try_get_registration(it)) { - device = it; - break; - } - } - mutex_unlock(&group->device_lock); - - return device; -} - /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ @@ -673,146 +352,12 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -/* - * VFIO Group fd, /dev/vfio/$GROUP - */ -static bool vfio_group_has_iommu(struct vfio_group *group) -{ - lockdep_assert_held(&group->group_lock); - /* - * There can only be users if there is a container, and if there is a - * container there must be users. - */ - WARN_ON(!group->container != !group->container_users); - - return group->container || group->iommufd; -} - -/* - * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or - * if there was no container to unset. Since the ioctl is called on - * the group, we know that still exists, therefore the only valid - * transition here is 1->0. - */ -static int vfio_group_ioctl_unset_container(struct vfio_group *group) -{ - int ret = 0; - - mutex_lock(&group->group_lock); - if (!vfio_group_has_iommu(group)) { - ret = -EINVAL; - goto out_unlock; - } - if (group->container) { - if (group->container_users != 1) { - ret = -EBUSY; - goto out_unlock; - } - vfio_group_detach_container(group); - } - if (group->iommufd) { - iommufd_ctx_put(group->iommufd); - group->iommufd = NULL; - } - -out_unlock: - mutex_unlock(&group->group_lock); - return ret; -} - -static int vfio_group_ioctl_set_container(struct vfio_group *group, - int __user *arg) -{ - struct vfio_container *container; - struct iommufd_ctx *iommufd; - struct fd f; - int ret; - int fd; - - if (get_user(fd, arg)) - return -EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - mutex_lock(&group->group_lock); - if (vfio_group_has_iommu(group)) { - ret = -EINVAL; - goto out_unlock; - } - if (!group->iommu_group) { - ret = -ENODEV; - goto out_unlock; - } - - container = vfio_container_from_file(f.file); - if (container) { - ret = vfio_container_attach_group(container, group); - goto out_unlock; - } - - iommufd = iommufd_ctx_from_file(f.file); - if (!IS_ERR(iommufd)) { - u32 ioas_id; - - ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); - if (ret) { - iommufd_ctx_put(group->iommufd); - goto out_unlock; - } - - group->iommufd = iommufd; - goto out_unlock; - } - - /* The FD passed is not recognized. */ - ret = -EBADFD; - -out_unlock: - mutex_unlock(&group->group_lock); - fdput(f); - return ret; -} - -static const struct file_operations vfio_device_fops; - /* true if the vfio_device has open_device() called but not close_device() */ static bool vfio_assert_device_open(struct vfio_device *device) { return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_device_group_use_iommu(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - int ret = 0; - - lockdep_assert_held(&group->group_lock); - - if (WARN_ON(!group->container)) - return -EINVAL; - - ret = vfio_group_use_container(group); - if (ret) - return ret; - vfio_device_container_register(device); - return 0; -} - -static void vfio_device_group_unuse_iommu(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - - lockdep_assert_held(&group->group_lock); - - if (WARN_ON(!group->container)) - return; - - vfio_device_container_unregister(device); - vfio_group_unuse_container(group); -} - static int vfio_device_first_open(struct vfio_device *device, struct iommufd_ctx *iommufd, struct kvm *kvm) { @@ -864,8 +409,8 @@ static void vfio_device_last_close(struct vfio_device *device, module_put(device->dev->driver->owner); } -static int vfio_device_open(struct vfio_device *device, - struct iommufd_ctx *iommufd, struct kvm *kvm) +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { int ret = 0; @@ -881,8 +426,8 @@ static int vfio_device_open(struct vfio_device *device, return ret; } -static void vfio_device_close(struct vfio_device *device, - struct iommufd_ctx *iommufd) +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); @@ -892,245 +437,6 @@ static void vfio_device_close(struct vfio_device *device, mutex_unlock(&device->dev_set->lock); } -static int vfio_device_group_open(struct vfio_device *device) -{ - int ret; - - mutex_lock(&device->group->group_lock); - if (!vfio_group_has_iommu(device->group)) { - ret = -EINVAL; - goto out_unlock; - } - - /* - * Here we pass the KVM pointer with the group under the lock. If the - * device driver will use it, it must obtain a reference and release it - * during close_device. - */ - ret = vfio_device_open(device, device->group->iommufd, - device->group->kvm); - -out_unlock: - mutex_unlock(&device->group->group_lock); - return ret; -} - -static void vfio_device_group_close(struct vfio_device *device) -{ - mutex_lock(&device->group->group_lock); - vfio_device_close(device, device->group->iommufd); - mutex_unlock(&device->group->group_lock); -} - -static struct file *vfio_device_open_file(struct vfio_device *device) -{ - struct file *filep; - int ret; - - ret = vfio_device_group_open(device); - if (ret) - goto err_out; - - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - device, O_RDWR); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_close_device; - } - - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. - */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - - if (device->group->type == VFIO_NO_IOMMU) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - /* - * On success the ref of device is moved to the file and - * put in vfio_device_fops_release() - */ - return filep; - -err_close_device: - vfio_device_group_close(device); -err_out: - return ERR_PTR(ret); -} - -static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, - char __user *arg) -{ - struct vfio_device *device; - struct file *filep; - char *buf; - int fdno; - int ret; - - buf = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - device = vfio_device_get_from_name(group, buf); - kfree(buf); - if (IS_ERR(device)) - return PTR_ERR(device); - - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - ret = fdno; - goto err_put_device; - } - - filep = vfio_device_open_file(device); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_put_fdno; - } - - fd_install(fdno, filep); - return fdno; - -err_put_fdno: - put_unused_fd(fdno); -err_put_device: - vfio_device_put_registration(device); - return ret; -} - -static int vfio_group_ioctl_get_status(struct vfio_group *group, - struct vfio_group_status __user *arg) -{ - unsigned long minsz = offsetofend(struct vfio_group_status, flags); - struct vfio_group_status status; - - if (copy_from_user(&status, arg, minsz)) - return -EFAULT; - - if (status.argsz < minsz) - return -EINVAL; - - status.flags = 0; - - mutex_lock(&group->group_lock); - if (!group->iommu_group) { - mutex_unlock(&group->group_lock); - return -ENODEV; - } - - /* - * With the container FD the iommu_group_claim_dma_owner() is done - * during SET_CONTAINER but for IOMMFD this is done during - * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd - * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due - * to viability. - */ - if (vfio_group_has_iommu(group)) - status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | - VFIO_GROUP_FLAGS_VIABLE; - else if (!iommu_group_dma_owner_claimed(group->iommu_group)) - status.flags |= VFIO_GROUP_FLAGS_VIABLE; - mutex_unlock(&group->group_lock); - - if (copy_to_user(arg, &status, minsz)) - return -EFAULT; - return 0; -} - -static long vfio_group_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - struct vfio_group *group = filep->private_data; - void __user *uarg = (void __user *)arg; - - switch (cmd) { - case VFIO_GROUP_GET_DEVICE_FD: - return vfio_group_ioctl_get_device_fd(group, uarg); - case VFIO_GROUP_GET_STATUS: - return vfio_group_ioctl_get_status(group, uarg); - case VFIO_GROUP_SET_CONTAINER: - return vfio_group_ioctl_set_container(group, uarg); - case VFIO_GROUP_UNSET_CONTAINER: - return vfio_group_ioctl_unset_container(group); - default: - return -ENOTTY; - } -} - -static int vfio_group_fops_open(struct inode *inode, struct file *filep) -{ - struct vfio_group *group = - container_of(inode->i_cdev, struct vfio_group, cdev); - int ret; - - mutex_lock(&group->group_lock); - - /* - * drivers can be zero if this races with vfio_device_remove_group(), it - * will be stable at 0 under the group rwsem - */ - if (refcount_read(&group->drivers) == 0) { - ret = -ENODEV; - goto out_unlock; - } - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { - ret = -EPERM; - goto out_unlock; - } - - /* - * Do we need multiple instances of the group open? Seems not. - */ - if (group->opened_file) { - ret = -EBUSY; - goto out_unlock; - } - group->opened_file = filep; - filep->private_data = group; - ret = 0; -out_unlock: - mutex_unlock(&group->group_lock); - return ret; -} - -static int vfio_group_fops_release(struct inode *inode, struct file *filep) -{ - struct vfio_group *group = filep->private_data; - - filep->private_data = NULL; - - mutex_lock(&group->group_lock); - /* - * Device FDs hold a group file reference, therefore the group release - * is only called when there are no open devices. - */ - WARN_ON(group->notifier.head); - if (group->container) - vfio_group_detach_container(group); - if (group->iommufd) { - iommufd_ctx_put(group->iommufd); - group->iommufd = NULL; - } - group->opened_file = NULL; - mutex_unlock(&group->group_lock); - return 0; -} - -static const struct file_operations vfio_group_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = vfio_group_fops_unl_ioctl, - .compat_ioctl = compat_ptr_ioctl, - .open = vfio_group_fops_open, - .release = vfio_group_fops_release, -}; - /* * Wrapper around pm_runtime_resume_and_get(). * Return error code on failure or 0 on success. @@ -1694,7 +1000,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) return device->ops->mmap(device, vma); } -static const struct file_operations vfio_device_fops = { +const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .release = vfio_device_fops_release, .read = vfio_device_fops_read, @@ -1704,121 +1010,6 @@ static const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -/** - * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file - * @file: VFIO group file - * - * The returned iommu_group is valid as long as a ref is held on the file. This - * returns a reference on the group. This function is deprecated, only the SPAPR - * path in kvm should call it. - */ -struct iommu_group *vfio_file_iommu_group(struct file *file) -{ - struct vfio_group *group = file->private_data; - struct iommu_group *iommu_group = NULL; - - if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) - return NULL; - - if (!vfio_file_is_group(file)) - return NULL; - - mutex_lock(&group->group_lock); - if (group->iommu_group) { - iommu_group = group->iommu_group; - iommu_group_ref_get(iommu_group); - } - mutex_unlock(&group->group_lock); - return iommu_group; -} -EXPORT_SYMBOL_GPL(vfio_file_iommu_group); - -/** - * vfio_file_is_group - True if the file is usable with VFIO aPIS - * @file: VFIO group file - */ -bool vfio_file_is_group(struct file *file) -{ - return file->f_op == &vfio_group_fops; -} -EXPORT_SYMBOL_GPL(vfio_file_is_group); - -/** - * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file - * is always CPU cache coherent - * @file: VFIO group file - * - * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop - * bit in DMA transactions. A return of false indicates that the user has - * rights to access additional instructions such as wbinvd on x86. - */ -bool vfio_file_enforced_coherent(struct file *file) -{ - struct vfio_group *group = file->private_data; - struct vfio_device *device; - bool ret = true; - - if (!vfio_file_is_group(file)) - return true; - - /* - * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then - * any domain later attached to it will also not support it. If the cap - * is set then the iommu_domain eventually attached to the device/group - * must use a domain with enforce_cache_coherency(). - */ - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (!device_iommu_capable(device->dev, - IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { - ret = false; - break; - } - } - mutex_unlock(&group->device_lock); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); - -/** - * vfio_file_set_kvm - Link a kvm with VFIO drivers - * @file: VFIO group file - * @kvm: KVM to link - * - * When a VFIO device is first opened the KVM will be available in - * device->kvm if one was associated with the group. - */ -void vfio_file_set_kvm(struct file *file, struct kvm *kvm) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return; - - mutex_lock(&group->group_lock); - group->kvm = kvm; - mutex_unlock(&group->group_lock); -} -EXPORT_SYMBOL_GPL(vfio_file_set_kvm); - -/** - * vfio_file_has_dev - True if the VFIO file is a handle for device - * @file: VFIO file to check - * @device: Device that must be part of the file - * - * Returns true if given file has permission to manipulate the given device. - */ -bool vfio_file_has_dev(struct file *file, struct vfio_device *device) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return false; - - return group == device->group; -} -EXPORT_SYMBOL_GPL(vfio_file_has_dev); - /* * Sub-module support */ @@ -1938,11 +1129,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, } EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); -static bool vfio_device_has_container(struct vfio_device *device) -{ - return device->group->container; -} - /* * Pin contiguous user pages and return their associated host pages for local * domain only. @@ -2064,55 +1250,6 @@ EXPORT_SYMBOL(vfio_dma_rw); /* * Module/class support */ -static char *vfio_devnode(struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); -} - -static int __init vfio_group_init(void) -{ - int ret; - - ida_init(&vfio.group_ida); - mutex_init(&vfio.group_lock); - INIT_LIST_HEAD(&vfio.group_list); - - ret = vfio_container_init(); - if (ret) - return ret; - - /* /dev/vfio/$GROUP */ - vfio.class = class_create(THIS_MODULE, "vfio"); - if (IS_ERR(vfio.class)) { - ret = PTR_ERR(vfio.class); - goto err_group_class; - } - - vfio.class->devnode = vfio_devnode; - - ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); - if (ret) - goto err_alloc_chrdev; - return 0; - -err_alloc_chrdev: - class_destroy(vfio.class); - vfio.class = NULL; -err_group_class: - vfio_container_cleanup(); - return ret; -} - -static void vfio_group_cleanup(void) -{ - WARN_ON(!list_empty(&vfio.group_list)); - ida_destroy(&vfio.group_ida); - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); - class_destroy(vfio.class); - vfio.class = NULL; - vfio_container_cleanup(); -} - static int __init vfio_init(void) { int ret; -- GitLab From 219072c09abde0f1d0a6ce091be375e8eb7d08f0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 5 Dec 2022 11:40:31 +0000 Subject: [PATCH 1158/1423] KVM: arm64: Fix benign bug with incorrect use of VA_BITS get_user_mapping_size() uses kvm's pgtable library to walk a user space page table created by the kernel, and in doing so, passes metadata that the library needs, including ia_bits, which defines the size of the input address. For the case where the kernel is compiled for 52 VA bits but runs on HW that does not support LVA, it will fall back to 48 VA bits at runtime. Therefore we must use vabits_actual rather than VA_BITS to get the true address size. This is benign in the current code base because the pgtable library only uses it for error checking. Fixes: 6011cf68c885 ("KVM: arm64: Walk userspace page tables to compute the THP mapping size") Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221205114031.3972780-1-ryan.roberts@arm.com --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 4efb983cff436..1ef0704420d96 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -641,7 +641,7 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) { struct kvm_pgtable pgt = { .pgd = (kvm_pte_t *)kvm->mm->pgd, - .ia_bits = VA_BITS, + .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_MAX_LEVELS - CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, -- GitLab From 96df59b1ae23f5c11698c3c2159aeb2ecd4944a4 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Fri, 4 Nov 2022 17:56:57 +0800 Subject: [PATCH 1159/1423] RISC-V: kexec: Fix memory leak of fdt buffer This is reported by kmemleak detector: unreferenced object 0xff60000082864000 (size 9588): comm "kexec", pid 146, jiffies 4294900634 (age 64.788s) hex dump (first 32 bytes): d0 0d fe ed 00 00 12 ed 00 00 00 48 00 00 11 40 ...........H...@ 00 00 00 28 00 00 00 11 00 00 00 02 00 00 00 00 ...(............ backtrace: [<00000000f95b17c4>] kmemleak_alloc+0x34/0x3e [<00000000b9ec8e3e>] kmalloc_order+0x9c/0xc4 [<00000000a95cf02e>] kmalloc_order_trace+0x34/0xb6 [<00000000f01e68b4>] __kmalloc+0x5c2/0x62a [<000000002bd497b2>] kvmalloc_node+0x66/0xd6 [<00000000906542fa>] of_kexec_alloc_and_setup_fdt+0xa6/0x6ea [<00000000e1166bde>] elf_kexec_load+0x206/0x4ec [<0000000036548e09>] kexec_image_load_default+0x40/0x4c [<0000000079fbe1b4>] sys_kexec_file_load+0x1c4/0x322 [<0000000040c62c03>] ret_from_syscall+0x0/0x2 In elf_kexec_load(), a buffer is allocated via kvmalloc() to store fdt. While it's not freed back to system when kexec kernel is reloaded or unloaded. Then memory leak is caused. Fix it by introducing riscv specific function arch_kimage_file_post_load_cleanup(), and freeing the buffer there. Fixes: 6261586e0c91 ("RISC-V: Add kexec_file support") Signed-off-by: Li Huafei Reviewed-by: Conor Dooley Reviewed-by: Liao Chang Link: https://lore.kernel.org/r/20221104095658.141222-1-lihuafei1@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kexec.h | 5 +++++ arch/riscv/kernel/elf_kexec.c | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index eee260e8ab308..2b56769cb530c 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -39,6 +39,7 @@ crash_setup_regs(struct pt_regs *newregs, #define ARCH_HAS_KIMAGE_ARCH struct kimage_arch { + void *fdt; /* For CONFIG_KEXEC_FILE */ unsigned long fdt_addr; }; @@ -62,6 +63,10 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +struct kimage; +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index 0cb94992c15b3..ff30fcb43f47a 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -21,6 +21,14 @@ #include #include +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + kvfree(image->arch.fdt); + image->arch.fdt = NULL; + + return kexec_image_post_load_cleanup_default(image); +} + static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_elf_info *elf_info, unsigned long old_pbase, unsigned long new_pbase) @@ -298,6 +306,8 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, pr_err("Error add DTB kbuf ret=%d\n", ret); goto out_free_fdt; } + /* Cache the fdt buffer address for memory cleanup */ + image->arch.fdt = fdt; pr_notice("Loaded device tree at 0x%lx\n", kbuf.mem); goto out; -- GitLab From cbc32023ddbdf4baa3d9dc513a2184a84080a5a2 Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Fri, 4 Nov 2022 17:56:58 +0800 Subject: [PATCH 1160/1423] RISC-V: kexec: Fix memory leak of elf header buffer This is reported by kmemleak detector: unreferenced object 0xff2000000403d000 (size 4096): comm "kexec", pid 146, jiffies 4294900633 (age 64.792s) hex dump (first 32 bytes): 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 .ELF............ 04 00 f3 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000566ca97c>] kmemleak_vmalloc+0x3c/0xbe [<00000000979283d8>] __vmalloc_node_range+0x3ac/0x560 [<00000000b4b3712a>] __vmalloc_node+0x56/0x62 [<00000000854f75e2>] vzalloc+0x2c/0x34 [<00000000e9a00db9>] crash_prepare_elf64_headers+0x80/0x30c [<0000000067e8bf48>] elf_kexec_load+0x3e8/0x4ec [<0000000036548e09>] kexec_image_load_default+0x40/0x4c [<0000000079fbe1b4>] sys_kexec_file_load+0x1c4/0x322 [<0000000040c62c03>] ret_from_syscall+0x0/0x2 In elf_kexec_load(), a buffer is allocated via vzalloc() to store elf headers. While it's not freed back to system when kdump kernel is reloaded or unloaded, or when image->elf_header is successfully set and then fails to load kdump kernel for some reason. Fix it by freeing the buffer in arch_kimage_file_post_load_cleanup(). Fixes: 8acea455fafa ("RISC-V: Support for kexec_file on panic") Signed-off-by: Li Huafei Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20221104095658.141222-2-lihuafei1@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/elf_kexec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index ff30fcb43f47a..5372b708fae21 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -26,6 +26,10 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) kvfree(image->arch.fdt); image->arch.fdt = NULL; + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + return kexec_image_post_load_cleanup_default(image); } -- GitLab From 8f8bcc8c720c360885639de66fe69756febed824 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:16 -0400 Subject: [PATCH 1161/1423] vfio/pci: Move all the SPAPR PCI specific logic to vfio_pci_core.ko The vfio_spapr_pci_eeh_open/release() functions are one line wrappers around an arch function. Just call them directly. This eliminates some weird exported symbols that don't need to exist. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 11 +++++++++-- drivers/vfio/vfio_spapr_eeh.c | 13 ------------- include/linux/vfio.h | 11 ----------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 189d4930c276d..56501e7ef5645 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#include +#endif #include "vfio_pci_priv.h" @@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } - vfio_spapr_pci_eeh_release(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_release(vdev->pdev); +#endif vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); @@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); - vfio_spapr_pci_eeh_open(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_open(vdev->pdev); +#endif if (vdev->sriov_pf_core_dev) { mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 67f55ac1d459c..c9d102aafbcd1 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -15,19 +15,6 @@ #define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" #define DRIVER_DESC "VFIO IOMMU SPAPR EEH" -/* We might build address mapping here for "fast" path later */ -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ - eeh_dev_open(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); - -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ - eeh_dev_release(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); - long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 43b67e46a2cb5..9378ca79d5480 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,21 +233,10 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -struct pci_dev; #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg); #else -static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ -} - -static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ -} - static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) -- GitLab From e5c38a203eb4343993e889eb69f5386f085f25ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:17 -0400 Subject: [PATCH 1162/1423] vfio/spapr: Move VFIO_CHECK_EXTENSION into tce_iommu_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PPC64 kconfig is a bit of a rats nest, but it turns out that if CONFIG_SPAPR_TCE_IOMMU is on then EEH must be too: config SPAPR_TCE_IOMMU bool "sPAPR TCE IOMMU Support" depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops is not implemented as it is not necessary for VFIO. config PPC_POWERNV select FORCE_PCI config PPC_PSERIES select FORCE_PCI config EEH bool depends on (PPC_POWERNV || PPC_PSERIES) && PCI default y So, just open code the call to eeh_enabled() into tce_iommu_ioctl(). Reviewed-by: Christoph Hellwig Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 10 ++++------ drivers/vfio/vfio_spapr_eeh.c | 6 ------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 169f07ac162d9..73cec2beae70b 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -785,14 +785,12 @@ static long tce_iommu_ioctl(void *iommu_data, switch (arg) { case VFIO_SPAPR_TCE_IOMMU: case VFIO_SPAPR_TCE_v2_IOMMU: - ret = 1; - break; + return 1; + case VFIO_EEH: + return eeh_enabled(); default: - ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); - break; + return 0; } - - return (ret < 0) ? 0 : ret; } /* diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index c9d102aafbcd1..221b1b637e18b 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -24,12 +24,6 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, long ret = -EINVAL; switch (cmd) { - case VFIO_CHECK_EXTENSION: - if (arg == VFIO_EEH) - ret = eeh_enabled() ? 1 : 0; - else - ret = 0; - break; case VFIO_EEH_PE_OP: pe = eeh_iommu_group_to_pe(group); if (!pe) -- GitLab From e276e25819b8a173a21947720bb0a548c0b724b7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:18 -0400 Subject: [PATCH 1163/1423] vfio: Move vfio_spapr_iommu_eeh_ioctl into vfio_iommu_spapr_tce.c As with the previous patch EEH is always enabled if SPAPR_TCE_IOMMU, so move this last bit of code into the main module. Now that this function only processes VFIO_EEH_PE_OP remove a level of indenting as well, it is only called by a case statement that already checked VFIO_EEH_PE_OP. This eliminates an unnecessary module and SPAPR code in a global header. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 1 - drivers/vfio/vfio_iommu_spapr_tce.c | 55 +++++++++++++++++- drivers/vfio/vfio_spapr_eeh.c | 88 ----------------------------- include/linux/vfio.h | 12 ---- 4 files changed, 53 insertions(+), 103 deletions(-) delete mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286f..50b8e8e3fb10d 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -10,7 +10,6 @@ vfio-y += vfio_main.o \ obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 73cec2beae70b..60a50ce8701e5 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -4,6 +4,7 @@ * * Copyright (C) 2013 IBM Corp. All rights reserved. * Author: Alexey Kardashevskiy + * Copyright Gavin Shan, IBM Corporation 2014. * * Derived from original vfio_iommu_type1.c: * Copyright (C) 2012 Red Hat, Inc. All rights reserved. @@ -773,6 +774,57 @@ static long tce_iommu_create_default_window(struct tce_container *container) return ret; } +static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group, + unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + return eeh_pe_set_option(pe, EEH_OPT_DISABLE); + case VFIO_EEH_PE_ENABLE: + return eeh_pe_set_option(pe, EEH_OPT_ENABLE); + case VFIO_EEH_PE_UNFREEZE_IO: + return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + case VFIO_EEH_PE_UNFREEZE_DMA: + return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + case VFIO_EEH_PE_GET_STATE: + return eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); + case VFIO_EEH_PE_RESET_HOT: + return eeh_pe_reset(pe, EEH_RESET_HOT, true); + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); + case VFIO_EEH_PE_CONFIGURE: + return eeh_pe_configure(pe); + case VFIO_EEH_PE_INJECT_ERR: + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); + if (op.argsz < minsz) + return -EINVAL; + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + + return eeh_pe_inject_err(pe, op.err.type, op.err.func, + op.err.addr, op.err.mask); + default: + return -EINVAL; + } +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -1044,8 +1096,7 @@ static long tce_iommu_ioctl(void *iommu_data, ret = 0; list_for_each_entry(tcegrp, &container->group_list, next) { - ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, - cmd, arg); + ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg); if (ret) return ret; } diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c deleted file mode 100644 index 221b1b637e18b..0000000000000 --- a/drivers/vfio/vfio_spapr_eeh.c +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * EEH functionality support for VFIO devices. The feature is only - * available on sPAPR compatible platforms. - * - * Copyright Gavin Shan, IBM Corporation 2014. - */ - -#include -#include -#include -#include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" -#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" - -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, unsigned long arg) -{ - struct eeh_pe *pe; - struct vfio_eeh_pe_op op; - unsigned long minsz; - long ret = -EINVAL; - - switch (cmd) { - case VFIO_EEH_PE_OP: - pe = eeh_iommu_group_to_pe(group); - if (!pe) - return -ENODEV; - - minsz = offsetofend(struct vfio_eeh_pe_op, op); - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - if (op.argsz < minsz || op.flags) - return -EINVAL; - - switch (op.op) { - case VFIO_EEH_PE_DISABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); - break; - case VFIO_EEH_PE_ENABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); - break; - case VFIO_EEH_PE_UNFREEZE_IO: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); - break; - case VFIO_EEH_PE_UNFREEZE_DMA: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); - break; - case VFIO_EEH_PE_GET_STATE: - ret = eeh_pe_get_state(pe); - break; - case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); - break; - case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); - break; - case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); - break; - case VFIO_EEH_PE_CONFIGURE: - ret = eeh_pe_configure(pe); - break; - case VFIO_EEH_PE_INJECT_ERR: - minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); - if (op.argsz < minsz) - return -EINVAL; - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - - ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, - op.err.addr, op.err.mask); - break; - default: - ret = -EINVAL; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 9378ca79d5480..b4d5d4ca3d7d5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,18 +233,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, - unsigned long arg); -#else -static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} -#endif /* CONFIG_VFIO_SPAPR_EEH */ - /* * IRQfd - generic */ -- GitLab From 20601c45a0fa20bbb5545f4dd69f4f18448f4973 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:19 -0400 Subject: [PATCH 1164/1423] vfio: Remove CONFIG_VFIO_SPAPR_EEH We don't need a kconfig symbol for this, just directly test CONFIG_EEH in the few places that need it. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 5 ----- drivers/vfio/pci/vfio_pci_core.c | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1e..d25b91adfd64c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -20,11 +20,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - config VFIO_VIRQFD tristate select EVENTFD diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 56501e7ef5645..f9365a5bc9612 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,7 +27,7 @@ #include #include #include -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) #include #endif @@ -689,7 +689,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif vfio_pci_core_disable(vdev); @@ -710,7 +710,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_open(vdev->pdev); #endif -- GitLab From e2d55709398e62cf53e5c7df3758ae52cc62d63a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:20 -0400 Subject: [PATCH 1165/1423] vfio: Fold vfio_virqfd.ko into vfio.ko This is only 1.8k, putting it in its own module is not really necessary. The kconfig infrastructure is still there to completely remove it for systems that are trying for small footprint. Put it in the main vfio.ko module now that kbuild can support multiple .c files. Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/5-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 2 +- drivers/vfio/Makefile | 4 +--- drivers/vfio/vfio.h | 13 +++++++++++++ drivers/vfio/vfio_main.c | 7 +++++++ drivers/vfio/virqfd.c | 17 +++-------------- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index d25b91adfd64c..0b8d53f63c7e5 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -21,7 +21,7 @@ config VFIO_IOMMU_SPAPR_TCE default VFIO config VFIO_VIRQFD - tristate + bool select EVENTFD default n diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 50b8e8e3fb10d..0721ed4831c92 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,13 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -vfio_virqfd-y := virqfd.o - obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ container.o +vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o -obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08c..a7113b4baaa24 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -124,6 +124,19 @@ long vfio_container_ioctl_check_extension(struct vfio_container *container, int __init vfio_container_init(void); void vfio_container_cleanup(void); +#if IS_ENABLED(CONFIG_VFIO_VIRQFD) +int __init vfio_virqfd_init(void); +void vfio_virqfd_exit(void); +#else +static inline int __init vfio_virqfd_init(void) +{ + return 0; +} +static inline void vfio_virqfd_exit(void) +{ +} +#endif + #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; #else diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 662e267a3e13d..7f88569c3ebaf 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1832,6 +1832,10 @@ static int __init vfio_init(void) if (ret) return ret; + ret = vfio_virqfd_init(); + if (ret) + goto err_virqfd; + /* /dev/vfio/$GROUP */ vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { @@ -1862,6 +1866,8 @@ err_dev_class: class_destroy(vfio.class); vfio.class = NULL; err_group_class: + vfio_virqfd_exit(); +err_virqfd: vfio_container_cleanup(); return ret; } @@ -1876,6 +1882,7 @@ static void __exit vfio_cleanup(void) class_destroy(vfio.device_class); vfio.device_class = NULL; class_destroy(vfio.class); + vfio_virqfd_exit(); vfio_container_cleanup(); vfio.class = NULL; xa_destroy(&vfio_device_set_xa); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 414e98d82b02e..497a17b378656 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -12,15 +12,12 @@ #include #include #include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Alex Williamson " -#define DRIVER_DESC "IRQFD support for VFIO bus drivers" +#include "vfio.h" static struct workqueue_struct *vfio_irqfd_cleanup_wq; static DEFINE_SPINLOCK(virqfd_lock); -static int __init vfio_virqfd_init(void) +int __init vfio_virqfd_init(void) { vfio_irqfd_cleanup_wq = create_singlethread_workqueue("vfio-irqfd-cleanup"); @@ -30,7 +27,7 @@ static int __init vfio_virqfd_init(void) return 0; } -static void __exit vfio_virqfd_exit(void) +void vfio_virqfd_exit(void) { destroy_workqueue(vfio_irqfd_cleanup_wq); } @@ -216,11 +213,3 @@ void vfio_virqfd_disable(struct virqfd **pvirqfd) flush_workqueue(vfio_irqfd_cleanup_wq); } EXPORT_SYMBOL_GPL(vfio_virqfd_disable); - -module_init(vfio_virqfd_init); -module_exit(vfio_virqfd_exit); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -- GitLab From ce3895735cc26957dc6b2a8f5af07ddab09483ae Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 2 Dec 2022 09:46:15 -0700 Subject: [PATCH 1166/1423] vfio/ap/ccw/samples: Fix device_register() unwind path We always need to call put_device() if device_register() fails. All vfio drivers calling device_register() include a similar unwind stack via gotos, therefore split device_unregister() into its device_del() and put_device() components in the unwind path, and add a goto target to handle only the put_device() requirement. Reported-by: Ruan Jinjie Link: https://lore.kernel.org/all/20221118032827.3725190-1-ruanjinjie@huawei.com Fixes: d61fc96f47fd ("sample: vfio mdev display - host device") Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Fixes: a5e6e6505f38 ("sample: vfio bochs vbe display (host device for bochs-drm)") Fixes: 9e6f07cd1eaa ("vfio/ccw: create a parent struct") Fixes: 36360658eb5a ("s390: vfio_ap: link the vfio_ap devices to the vfio_ap bus subsystem") Cc: Tony Krowiak Cc: Halil Pasic Cc: Jason Herne Cc: Kirti Wankhede Reviewed-by: Kevin Tian Reviewed-by: Eric Farman Reviewed-by: Tony Krowiak Reviewed-by: Jason J. Herne Link: https://lore.kernel.org/r/166999942139.645727.12439756512449846442.stgit@omen Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 3 ++- drivers/s390/crypto/vfio_ap_drv.c | 2 +- samples/vfio-mdev/mbochs.c | 7 ++++--- samples/vfio-mdev/mdpy.c | 7 ++++--- samples/vfio-mdev/mtty.c | 7 ++++--- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index c2a65808605af..54aba7cceb33f 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -199,8 +199,9 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return 0; out_unreg: - device_unregister(&parent->dev); + device_del(&parent->dev); out_free: + put_device(&parent->dev); dev_set_drvdata(&sch->dev, NULL); return ret; } diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index f43cfeabd2cc8..997b524bdd2b5 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -122,7 +122,7 @@ static int vfio_ap_matrix_dev_create(void) return 0; matrix_drv_err: - device_unregister(&matrix_dev->device); + device_del(&matrix_dev->device); matrix_reg_err: put_device(&matrix_dev->device); matrix_alloc_err: diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 8b5a3a778a257..e54eb752e1ba8 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1430,7 +1430,7 @@ static int __init mbochs_dev_init(void) ret = device_register(&mbochs_dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, mbochs_mdev_types, @@ -1441,8 +1441,9 @@ static int __init mbochs_dev_init(void) return 0; err_device: - device_unregister(&mbochs_dev); -err_class: + device_del(&mbochs_dev); +err_put: + put_device(&mbochs_dev); class_destroy(mbochs_class); err_driver: mdev_unregister_driver(&mbochs_driver); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 721fb06c6413f..e8400fdab71da 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -717,7 +717,7 @@ static int __init mdpy_dev_init(void) ret = device_register(&mdpy_dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, mdpy_mdev_types, @@ -728,8 +728,9 @@ static int __init mdpy_dev_init(void) return 0; err_device: - device_unregister(&mdpy_dev); -err_class: + device_del(&mdpy_dev); +err_put: + put_device(&mdpy_dev); class_destroy(mdpy_class); err_driver: mdev_unregister_driver(&mdpy_driver); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 3c2a421b9b696..e887de672c526 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1330,7 +1330,7 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, &mtty_driver, mtty_mdev_types, @@ -1340,8 +1340,9 @@ static int __init mtty_dev_init(void) return 0; err_device: - device_unregister(&mtty_dev.dev); -err_class: + device_del(&mtty_dev.dev); +err_put: + put_device(&mtty_dev.dev); class_destroy(mtty_dev.vd_class); err_driver: mdev_unregister_driver(&mtty_driver); -- GitLab From 5c3022e4a616d800cf5f4c3a981d7992179e44a1 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 9 Nov 2022 01:49:36 -0500 Subject: [PATCH 1167/1423] riscv: stacktrace: Fixup ftrace_graph_ret_addr retp argument The 'retp' is a pointer to the return address on the stack, so we must pass the current return address pointer as the 'retp' argument to ftrace_push_return_trace(). Not parent function's return address on the stack. Fixes: b785ec129bd9 ("riscv/ftrace: Add HAVE_FUNCTION_GRAPH_RET_ADDR_PTR support") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20221109064937.3643993-2-guoren@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 08d11a53f39e7..bcfe9eb55f80f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -58,7 +58,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, } else { fp = frame->fp; pc = ftrace_graph_ret_addr(current, NULL, frame->ra, - (unsigned long *)(fp - 8)); + &frame->ra); } } -- GitLab From 7ecdadf7f8c659524f6b2aebf6be7bf619764d90 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 9 Nov 2022 01:49:37 -0500 Subject: [PATCH 1168/1423] riscv: stacktrace: Make walk_stackframe cross pt_regs frame The current walk_stackframe with FRAME_POINTER would stop unwinding at ret_from_exception: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1518 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 1, name: init CPU: 0 PID: 1 Comm: init Not tainted 5.10.113-00021-g15c15974895c-dirty #192 Call Trace: [] walk_stackframe+0x0/0xee [] show_stack+0x32/0x4a [] dump_stack_lvl+0x72/0x8e [] dump_stack+0x14/0x1c [] ___might_sleep+0x12e/0x138 [] __might_sleep+0x10/0x18 [] down_read+0x22/0xa4 [] do_page_fault+0xb0/0x2fe [] ret_from_exception+0x0/0xc The optimization would help walk_stackframe cross the pt_regs frame and get more backtrace of debug info: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1518 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 1, name: init CPU: 0 PID: 1 Comm: init Not tainted 5.10.113-00021-g15c15974895c-dirty #192 Call Trace: [] walk_stackframe+0x0/0xee [] show_stack+0x32/0x4a [] dump_stack_lvl+0x72/0x8e [] dump_stack+0x14/0x1c [] ___might_sleep+0x12e/0x138 [] __might_sleep+0x10/0x18 [] down_read+0x22/0xa4 [] do_page_fault+0xb0/0x2fe [] ret_from_exception+0x0/0xc [] riscv_intc_irq+0x1a/0x72 [] ret_from_exception+0x0/0xc [] vma_link+0x54/0x160 [] mmap_region+0x2cc/0x4d0 [] do_mmap+0x2d8/0x3ac [] vm_mmap_pgoff+0x70/0xb8 [] vm_mmap+0x2a/0x36 [] elf_map+0x72/0x84 [] load_elf_binary+0x69a/0xec8 [] bprm_execve+0x246/0x53a [] kernel_execve+0xe8/0x124 [] run_init_process+0xfa/0x10c [] try_to_run_init_process+0x12/0x3c [] kernel_init+0xb4/0xf8 [] ret_from_exception+0x0/0xc Here is the error injection test code for the above output: drivers/irqchip/irq-riscv-intc.c: static asmlinkage void riscv_intc_irq(struct pt_regs *regs) { unsigned long cause = regs->cause & ~CAUSE_IRQ_FLAG; + u32 tmp; __get_user(tmp, (u32 *)0); Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20221109064937.3643993-3-guoren@kernel.org [Palmer: use SYM_CODE_*] Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 3 ++- arch/riscv/kernel/stacktrace.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index b9eda3fcbd6d7..da44fe2d0d821 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -248,7 +248,7 @@ ret_from_syscall_rejected: andi t0, t0, _TIF_SYSCALL_WORK bnez t0, handle_syscall_trace_exit -ret_from_exception: +SYM_CODE_START_NOALIGN(ret_from_exception) REG_L s0, PT_STATUS(sp) csrc CSR_STATUS, SR_IE #ifdef CONFIG_TRACE_IRQFLAGS @@ -262,6 +262,7 @@ ret_from_exception: andi s0, s0, SR_SPP #endif bnez s0, resume_kernel +SYM_CODE_END(ret_from_exception) resume_userspace: /* Interrupts must be disabled here so flags are checked atomically */ diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index bcfe9eb55f80f..75c8dd64fc48e 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -16,6 +16,8 @@ #ifdef CONFIG_FRAME_POINTER +extern asmlinkage void ret_from_exception(void); + void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg) { @@ -59,6 +61,13 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = frame->fp; pc = ftrace_graph_ret_addr(current, NULL, frame->ra, &frame->ra); + if (pc == (unsigned long)ret_from_exception) { + if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc))) + break; + + pc = ((struct pt_regs *)sp)->epc; + fp = ((struct pt_regs *)sp)->s0; + } } } -- GitLab From 0a584655ef89541dae4d48d2c523b1480ae80284 Mon Sep 17 00:00:00 2001 From: Francisco Munoz Date: Mon, 5 Dec 2022 17:16:37 -0700 Subject: [PATCH 1169/1423] PCI: vmd: Fix secondary bus reset for Intel bridges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset was never applied in the current implementation because Intel Bridges owned by VMD are parentless. Internally, pci_reset_bus() applies a reset to the parent of the PCI device supplied as argument, but in this case it failed because there wasn't a parent. In more detail, this change allows the VMD driver to enumerate NVMe devices in pass-through configurations when guest reboots are performed. There was an attempted to fix this, but later we discovered that the code inside pci_reset_bus() wasn’t triggering secondary bus resets. Therefore, we updated the parameters passed to it, and now NVMe SSDs attached to VMD bridges are properly enumerated in VT-d pass-through scenarios. Link: https://lore.kernel.org/r/20221206001637.4744-1-francisco.munoz.ruiz@linux.intel.com Fixes: 6aab5622296b ("PCI: vmd: Clean up domain before enumeration") Signed-off-by: Francisco Munoz Signed-off-by: Lorenzo Pieralisi Reviewed-by: Nirmal Patel Reviewed-by: Jonathan Derrick --- drivers/pci/controller/vmd.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 98e0746e681c9..769eedeb8802a 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -719,6 +719,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + struct pci_dev *dev; int ret; /* @@ -859,8 +860,25 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_scan_child_bus(vmd->bus); vmd_domain_reset(vmd); - list_for_each_entry(child, &vmd->bus->children, node) - pci_reset_bus(child->self); + + /* When Intel VMD is enabled, the OS does not discover the Root Ports + * owned by Intel VMD within the MMCFG space. pci_reset_bus() applies + * a reset to the parent of the PCI device supplied as argument. This + * is why we pass a child device, so the reset can be triggered at + * the Intel bridge level and propagated to all the children in the + * hierarchy. + */ + list_for_each_entry(child, &vmd->bus->children, node) { + if (!list_empty(&child->devices)) { + dev = list_first_entry(&child->devices, + struct pci_dev, bus_list); + if (pci_reset_bus(dev)) + pci_warn(dev, "can't reset device: %d\n", ret); + + break; + } + } + pci_assign_unassigned_bus_resources(vmd->bus); /* -- GitLab From 19098934f910b4d47cb30251dd39ffa57bef9523 Mon Sep 17 00:00:00 2001 From: John Thomson Date: Tue, 6 Dec 2022 06:46:45 +1000 Subject: [PATCH 1170/1423] PCI: mt7621: Add sentinel to quirks table Current driver is missing a sentinel in the struct soc_device_attribute array, which causes an oops when assessed by the soc_device_match(mt7621_pcie_quirks_match) call. This was only exposed once the CONFIG_SOC_MT7621 mt7621 soc_dev_attr was fixed to register the SOC as a device, in: commit 7c18b64bba3b ("mips: ralink: mt7621: do not use kzalloc too early") Fix it by adding the required sentinel. Link: https://lore.kernel.org/lkml/26ebbed1-0fe9-4af9-8466-65f841d0b382@app.fastmail.com Link: https://lore.kernel.org/r/20221205204645.301301-1-git@johnthomson.fastmail.com.au Fixes: b483b4e4d3f6 ("staging: mt7621-pci: add quirks for 'E2' revision using 'soc_device_attribute'") Signed-off-by: John Thomson Signed-off-by: Lorenzo Pieralisi Acked-by: Sergio Paracuellos --- drivers/pci/controller/pcie-mt7621.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c index 4bd1abf26008f..ee7aad09d6277 100644 --- a/drivers/pci/controller/pcie-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -466,7 +466,8 @@ static int mt7621_pcie_register_host(struct pci_host_bridge *host) } static const struct soc_device_attribute mt7621_pcie_quirks_match[] = { - { .soc_id = "mt7621", .revision = "E2" } + { .soc_id = "mt7621", .revision = "E2" }, + { /* sentinel */ } }; static int mt7621_pcie_probe(struct platform_device *pdev) -- GitLab From 74eac50391ce42c5d0038d6f0e580576e53aec4e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 5 Dec 2022 10:45:30 +0100 Subject: [PATCH 1171/1423] dt-bindings: PCI: qcom: Allow 'dma-coherent' property Devices on some PCIe buses may be cache coherent and must be marked as such in the devicetree to avoid data corruption. This is specifically needed on recent Qualcomm platforms like SC8280XP. Link: https://lore.kernel.org/r/20221205094530.12883-1-johan+linaro@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Lorenzo Pieralisi Acked-by: Rob Herring --- Documentation/devicetree/bindings/pci/qcom,pcie.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml index 2f851c804bb07..a5859bb3dc28c 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.yaml @@ -62,6 +62,8 @@ properties: minItems: 3 maxItems: 13 + dma-coherent: true + interconnects: maxItems: 2 -- GitLab From ec9eaf68c1dcd1b0d4e0bad0630ddac49c20bbe8 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 27 Nov 2022 12:41:37 +0100 Subject: [PATCH 1172/1423] dt-bindings: PCI: mediatek-gen3: add SoC based clock config The PCIe driver covers different SOC which needing different clock configs. Define them based on compatible. Link: https://lore.kernel.org/r/20221127114142.156573-4-linux@fw-web.de Signed-off-by: Frank Wunderlich Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Matthias Brugger Acked-by: Jianjun Wang --- .../bindings/pci/mediatek-pcie-gen3.yaml | 47 ++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml index bc90f0ec7bd99..ef5cc1fc4d107 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml @@ -43,9 +43,6 @@ description: |+ each set has its own address for MSI message, and supports 32 MSI vectors to generate interrupt. -allOf: - - $ref: /schemas/pci/pci-bus.yaml# - properties: compatible: oneOf: @@ -90,15 +87,7 @@ properties: maxItems: 6 clock-names: - items: - - const: pl_250m - - const: tl_26m - - const: tl_96m - - const: tl_32k - - const: peri_26m - - enum: - - top_133m # for MT8192 - - peri_mem # for MT8188/MT8195 + maxItems: 6 assigned-clocks: maxItems: 1 @@ -147,6 +136,40 @@ required: - '#interrupt-cells' - interrupt-controller +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + - if: + properties: + compatible: + const: mediatek,mt8192-pcie + then: + properties: + clock-names: + items: + - const: pl_250m + - const: tl_26m + - const: tl_96m + - const: tl_32k + - const: peri_26m + - const: top_133m + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt8188-pcie + - mediatek,mt8195-pcie + then: + properties: + clock-names: + items: + - const: pl_250m + - const: tl_26m + - const: tl_96m + - const: tl_32k + - const: peri_26m + - const: peri_mem + unevaluatedProperties: false examples: -- GitLab From d3fd0ee7a4a1e796413fab7affc72eeec31bed13 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 27 Nov 2022 12:41:38 +0100 Subject: [PATCH 1173/1423] dt-bindings: PCI: mediatek-gen3: add support for mt7986 Add compatible string and clock-definition for mt7986. It needs 4 clocks for PCIe, define them in binding. Link: https://lore.kernel.org/r/20221127114142.156573-5-linux@fw-web.de Signed-off-by: Frank Wunderlich Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Matthias Brugger Acked-by: Jianjun Wang --- .../bindings/pci/mediatek-pcie-gen3.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml index ef5cc1fc4d107..7e8c7a2a5f9b6 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml @@ -48,6 +48,7 @@ properties: oneOf: - items: - enum: + - mediatek,mt7986-pcie - mediatek,mt8188-pcie - mediatek,mt8195-pcie - const: mediatek,mt8192-pcie @@ -84,9 +85,11 @@ properties: enum: [ phy, mac ] clocks: + minItems: 4 maxItems: 6 clock-names: + minItems: 4 maxItems: 6 assigned-clocks: @@ -169,6 +172,20 @@ allOf: - const: tl_32k - const: peri_26m - const: peri_mem + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt7986-pcie + then: + properties: + clock-names: + items: + - const: pl_250m + - const: tl_26m + - const: peri_26m + - const: top_133m unevaluatedProperties: false -- GitLab From c943a9374d12bebca2d0de7e273b1c723b58c122 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:25 +0200 Subject: [PATCH 1174/1423] net/mlx5: Introduce ifc bits for pre_copy Introduce ifc related stuff to enable PRE_COPY of VF during migration. Signed-off-by: Shay Drory Acked-by: Leon Romanovsky Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6ff..230a96626a5f3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1882,7 +1882,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0xe0]; + u8 reserved_at_c0[0x8]; + u8 migration_multi_load[0x1]; + u8 migration_tracking_state[0x1]; + u8 reserved_at_ca[0x16]; + + u8 reserved_at_e0[0xc0]; u8 reserved_at_1a0[0xb]; u8 log_min_mkey_entity_size[0x5]; @@ -11918,7 +11923,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 reserved_at_41[0xf]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; @@ -11944,7 +11950,9 @@ struct mlx5_ifc_save_vhca_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 set_track[0x1]; + u8 reserved_at_42[0xe]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; -- GitLab From 4db52602a6074e9cc523500b8304600ff63e7b85 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Dec 2022 10:34:26 +0200 Subject: [PATCH 1175/1423] vfio: Extend the device migration protocol with PRE_COPY The optional PRE_COPY states open the saving data transfer FD before reaching STOP_COPY and allows the device to dirty track internal state changes with the general idea to reduce the volume of data transferred in the STOP_COPY stage. While in PRE_COPY the device remains RUNNING, but the saving FD is open. Only if the device also supports RUNNING_P2P can it support PRE_COPY_P2P, which halts P2P transfers while continuing the saving FD. PRE_COPY, with P2P support, requires the driver to implement 7 new arcs and exists as an optional FSM branch between RUNNING and STOP_COPY: RUNNING -> PRE_COPY -> PRE_COPY_P2P -> STOP_COPY A new ioctl VFIO_MIG_GET_PRECOPY_INFO is provided to allow userspace to query the progress of the precopy operation in the driver with the idea it will judge to move to STOP_COPY at least once the initial data set is transferred, and possibly after the dirty size has shrunk appropriately. This ioctl is valid only in PRE_COPY states and kernel driver should return -EINVAL from any other migration state. Compared to the v1 clarification, STOP_COPY -> PRE_COPY is blocked and to be defined in future. We also split the pending_bytes report into the initial and sustaining values, e.g.: initial_bytes and dirty_bytes. initial_bytes: Amount of initial precopy data. dirty_bytes: Device state changes relative to data previously retrieved. These fields are not required to have any bearing to STOP_COPY phase. It is recommended to leave PRE_COPY for STOP_COPY only after the initial_bytes field reaches zero. Leaving PRE_COPY earlier might make things slower. Signed-off-by: Jason Gunthorpe Signed-off-by: Shay Drory Reviewed-by: Kevin Tian Reviewed-by: Shameer Kolothum Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 74 ++++++++++++++++++++++- include/uapi/linux/vfio.h | 123 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 191 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7f88569c3ebaf..03dbcd3d96f0e 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1042,7 +1042,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm) { - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; /* * The coding in this table requires the driver to implement the * following FSM arcs: @@ -1057,30 +1057,65 @@ int vfio_mig_get_next_state(struct vfio_device *device, * RUNNING_P2P -> RUNNING * RUNNING_P2P -> STOP * STOP -> RUNNING_P2P - * Without P2P the driver must implement: + * + * If precopy is supported then the driver must support these additional + * FSM arcs: + * RUNNING -> PRE_COPY + * PRE_COPY -> RUNNING + * PRE_COPY -> STOP_COPY + * However, if precopy and P2P are supported together then the driver + * must support these additional arcs beyond the P2P arcs above: + * PRE_COPY -> RUNNING + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * PRE_COPY_P2P -> RUNNING_P2P + * PRE_COPY_P2P -> STOP_COPY + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * + * Without P2P and precopy the driver must implement: * RUNNING -> STOP * STOP -> RUNNING * * The coding will step through multiple states for some combination * transitions; if all optional features are supported, this means the * following ones: + * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY + * PRE_COPY -> RUNNING -> RUNNING_P2P + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING + * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P -> STOP + * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING * RESUMING -> STOP -> RUNNING_P2P + * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P * RESUMING -> STOP -> RUNNING_P2P -> RUNNING + * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * RESUMING -> STOP -> STOP_COPY + * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P * RUNNING -> RUNNING_P2P -> STOP * RUNNING -> RUNNING_P2P -> STOP -> RESUMING * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY + * RUNNING_P2P -> RUNNING -> PRE_COPY * RUNNING_P2P -> STOP -> RESUMING * RUNNING_P2P -> STOP -> STOP_COPY + * STOP -> RUNNING_P2P -> PRE_COPY_P2P * STOP -> RUNNING_P2P -> RUNNING + * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * STOP_COPY -> STOP -> RESUMING * STOP_COPY -> STOP -> RUNNING_P2P * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING + * + * The following transitions are blocked: + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P */ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1089,14 +1124,38 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, + [VFIO_DEVICE_STATE_PRE_COPY] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, [VFIO_DEVICE_STATE_STOP_COPY] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1105,6 +1164,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RESUMING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1113,6 +1174,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING_P2P] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1121,6 +1184,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_ERROR] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, @@ -1131,6 +1196,11 @@ int vfio_mig_get_next_state(struct vfio_device *device, static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_PRE_COPY] = + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING_P2P] = diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 3e45dbaf190ef..23105eb036fa6 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -819,12 +819,20 @@ struct vfio_device_feature { * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P * is supported in addition to the STOP_COPY states. * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that + * PRE_COPY is supported in addition to the STOP_COPY states. + * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY + * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported + * in addition to the STOP_COPY states. + * * Other combinations of flags have behavior to be defined in the future. */ struct vfio_device_feature_migration { __aligned_u64 flags; #define VFIO_MIGRATION_STOP_COPY (1 << 0) #define VFIO_MIGRATION_P2P (1 << 1) +#define VFIO_MIGRATION_PRE_COPY (1 << 2) }; #define VFIO_DEVICE_FEATURE_MIGRATION 1 @@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state { * RESUMING - The device is stopped and is loading a new internal state * ERROR - The device has failed and must be reset * - * And 1 optional state to support VFIO_MIGRATION_P2P: + * And optional states to support VFIO_MIGRATION_P2P: * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA + * And VFIO_MIGRATION_PRE_COPY: + * PRE_COPY - The device is running normally but tracking internal state + * changes + * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY: + * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA * * The FSM takes actions on the arcs between FSM states. The driver implements * the following behavior for the FSM arcs: @@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state { * * To abort a RESUMING session the device must be reset. * + * PRE_COPY -> RUNNING * RUNNING_P2P -> RUNNING * While in RUNNING the device is fully operational, the device may generate * interrupts, DMA, respond to MMIO, all vfio device regions are functional, * and the device may advance its internal state. * + * The PRE_COPY arc will terminate a data transfer session. + * + * PRE_COPY_P2P -> RUNNING_P2P * RUNNING -> RUNNING_P2P * STOP -> RUNNING_P2P * While in RUNNING_P2P the device is partially running in the P2P quiescent * state defined below. * + * The PRE_COPY_P2P arc will terminate a data transfer session. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P * STOP -> STOP_COPY - * This arc begin the process of saving the device state and will return a - * new data_fd. + * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states + * which share a data transfer session. Moving between these states alters + * what is streamed in session, but does not terminate or otherwise affect + * the associated fd. + * + * These arcs begin the process of saving the device state and will return a + * new data_fd. The migration driver may perform actions such as enabling + * dirty logging of device state when entering PRE_COPY or PER_COPY_P2P. * + * Each arc does not change the device operation, the device remains + * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below + * in PRE_COPY_P2P -> STOP_COPY. + * + * PRE_COPY -> PRE_COPY_P2P + * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above. + * However, while in the PRE_COPY_P2P state, the device is partially running + * in the P2P quiescent state defined below, like RUNNING_P2P. + * + * PRE_COPY_P2P -> PRE_COPY + * This arc allows returning the device to a full RUNNING behavior while + * continuing all the behaviors of PRE_COPY. + * + * PRE_COPY_P2P -> STOP_COPY * While in the STOP_COPY state the device has the same behavior as STOP * with the addition that the data transfers session continues to stream the * migration state. End of stream on the FD indicates the entire device @@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state { * device state for this arc if required to prepare the device to receive the * migration data. * + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P + * These arcs are not permitted and return error if requested. Future + * revisions of this API may define behaviors for these arcs, in this case + * support will be discoverable by a new flag in + * VFIO_DEVICE_FEATURE_MIGRATION. + * * any -> ERROR * ERROR cannot be specified as a device state, however any transition request * can be failed with an errno return and may then move the device_state into @@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state { * The optional peer to peer (P2P) quiescent state is intended to be a quiescent * state for the device for the purposes of managing multiple devices within a * user context where peer-to-peer DMA between devices may be active. The - * RUNNING_P2P states must prevent the device from initiating + * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating * any new P2P DMA transactions. If the device can identify P2P transactions * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration * driver must complete any such outstanding operations prior to completing the @@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state { * above FSM arcs. As there are multiple paths through the FSM arcs the path * should be selected based on the following rules: * - Select the shortest path. + * - The path cannot have saving group states as interior arcs, only + * starting/end states. * Refer to vfio_mig_get_next_state() for the result of the algorithm. * * The automatic transit through the FSM arcs that make up the combination @@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state { * support them. The user can discover if these states are supported by using * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can * avoid knowing about these optional states if the kernel driver supports them. + * + * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY + * is not present. */ enum vfio_device_mig_state { VFIO_DEVICE_STATE_ERROR = 0, @@ -984,8 +1037,70 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_STOP_COPY = 3, VFIO_DEVICE_STATE_RESUMING = 4, VFIO_DEVICE_STATE_RUNNING_P2P = 5, + VFIO_DEVICE_STATE_PRE_COPY = 6, + VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, +}; + +/** + * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21) + * + * This ioctl is used on the migration data FD in the precopy phase of the + * migration data transfer. It returns an estimate of the current data sizes + * remaining to be transferred. It allows the user to judge when it is + * appropriate to leave PRE_COPY for STOP_COPY. + * + * This ioctl is valid only in PRE_COPY states and kernel driver should + * return -EINVAL from any other migration state. + * + * The vfio_precopy_info data structure returned by this ioctl provides + * estimates of data available from the device during the PRE_COPY states. + * This estimate is split into two categories, initial_bytes and + * dirty_bytes. + * + * The initial_bytes field indicates the amount of initial precopy + * data available from the device. This field should have a non-zero initial + * value and decrease as migration data is read from the device. + * It is recommended to leave PRE_COPY for STOP_COPY only after this field + * reaches zero. Leaving PRE_COPY earlier might make things slower. + * + * The dirty_bytes field tracks device state changes relative to data + * previously retrieved. This field starts at zero and may increase as + * the internal device state is modified or decrease as that modified + * state is read from the device. + * + * Userspace may use the combination of these fields to estimate the + * potential data size available during the PRE_COPY phases, as well as + * trends relative to the rate the device is dirtying its internal + * state, but these fields are not required to have any bearing relative + * to the data size available during the STOP_COPY phase. + * + * Drivers have a lot of flexibility in when and what they transfer during the + * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO. + * + * During pre-copy the migration data FD has a temporary "end of stream" that is + * reached when both initial_bytes and dirty_byte are zero. For instance, this + * may indicate that the device is idle and not currently dirtying any internal + * state. When read() is done on this temporary end of stream the kernel driver + * should return ENOMSG from read(). Userspace can wait for more data (which may + * never come) by using poll. + * + * Once in STOP_COPY the migration data FD has a permanent end of stream + * signaled in the usual way by read() always returning 0 and poll always + * returning readable. ENOMSG may not be returned in STOP_COPY. + * Support for this ioctl is mandatory if a driver claims to support + * VFIO_MIGRATION_PRE_COPY. + * + * Return: 0 on success, -1 and errno set on failure. + */ +struct vfio_precopy_info { + __u32 argsz; + __u32 flags; + __aligned_u64 initial_bytes; + __aligned_u64 dirty_bytes; }; +#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) + /* * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power * state with the platform-based power management. Device use of lower power -- GitLab From 0e7caa65d707b93fbb4322c6313f739fa9103dfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:27 +0200 Subject: [PATCH 1176/1423] vfio/mlx5: Enforce a single SAVE command at a time Enforce a single SAVE command at a time. As the SAVE command is an asynchronous one, we must enforce running only a single command at a time. This will preserve ordering between multiple calls and protect from races on the migration file data structure. This is a must for the next patches from the series where as part of PRE_COPY we may have multiple images to be saved and multiple SAVE commands may be issued from different flows. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 ++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0848bc905d3ea..55ee8036f59cd 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -281,6 +281,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } @@ -321,6 +322,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return -ENOTCONN; mdev = mvdev->mdev; + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + err = mlx5_core_alloc_pd(mdev, &pdn); if (err) return err; @@ -371,6 +376,7 @@ err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e57..8ffa7699872c4 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -37,6 +37,7 @@ struct mlx5_vf_migration_file { unsigned long last_offset; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6e9cf2aacc527..0d71ebb2a9727 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -245,6 +245,13 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, -- GitLab From 9945a67ea4b30657dd998c7fbbea1b3950747168 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:28 +0200 Subject: [PATCH 1177/1423] vfio/mlx5: Refactor PD usage This patch refactors PD usage such as its life cycle will be as of the migration file instead of allocating/destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED and a single PD can be simply reused. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 53 ++++++++++++++++++++++++------------ drivers/vfio/pci/mlx5/cmd.h | 5 +++- drivers/vfio/pci/mlx5/main.c | 44 ++++++++++++++++++++++-------- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 55ee8036f59cd..a97eac49e3d68 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -279,7 +279,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5_core_destroy_mkey(mdev, async_data->mkey); dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -314,7 +313,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -326,16 +325,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); if (err) goto err_dma_map; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -357,7 +352,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, migf->total_length = 0; get_file(migf->filp); async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -375,7 +369,6 @@ err_out: err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); complete(&migf->save_comp); return err; } @@ -386,7 +379,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -400,15 +393,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, } mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); if (err) - goto err_reg; + goto end; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -424,13 +413,41 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mlx5_core_destroy_mkey(mdev, mkey); err_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); end: mutex_unlock(&migf->lock); return err; } +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); + return err; +} + +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + + WARN_ON(migf->mvdev->mdev_detach); + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8ffa7699872c4..ba760f956d535 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -16,7 +16,6 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 pdn; u32 mkey; void *out; }; @@ -27,6 +26,7 @@ struct mlx5_vf_migration_file { u8 disabled:1; u8 is_err:1; + u32 pdn; struct sg_append_table table; size_t total_length; size_t allocated_length; @@ -127,6 +127,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0d71ebb2a9727..1916f7c1468c1 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -236,12 +236,15 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -257,20 +260,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) - goto out_free; + goto out_pd; ret = mlx5vf_add_migration_pages( migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); if (ret) - goto out_free; + goto out_pd; - migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_disable_fd(migf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } @@ -352,6 +360,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -360,20 +369,30 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_free: + fput(migf->filp); +end: + kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -381,6 +400,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } -- GitLab From 91454f8b9bf4ce6be1d9a0b4de401bc3c6313a95 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:29 +0200 Subject: [PATCH 1178/1423] vfio/mlx5: Refactor MKEY usage This patch refactors MKEY usage such as its life cycle will be as of the migration file instead of allocating/destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED. We achieve it by having a new struct named mlx5_vhca_data_buffer which holds the mkey and its related stuff as of sg_append_table, allocated_length, etc. The above fields were taken out from the migration file main struct, into mlx5_vhca_data_buffer dedicated struct with the proper helpers in place. For now we have a single mlx5_vhca_data_buffer per migration file. However, in coming patches we'll have multiple of them to support multiple images. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 162 ++++++++++++++++++++++------------- drivers/vfio/pci/mlx5/cmd.h | 37 +++++--- drivers/vfio/pci/mlx5/main.c | 92 +++++++++++--------- 3 files changed, 178 insertions(+), 113 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index a97eac49e3d68..ed4c472d2eae5 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -210,11 +210,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +232,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,20 +255,99 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ? migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { @@ -276,9 +355,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -292,7 +368,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, + WRITE_ONCE(migf->buf->length, MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size)); wake_up_interruptible(&migf->poll_wait); @@ -307,39 +383,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; - MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; async_data->out = kvzalloc(out_size, GFP_KERNEL); @@ -348,10 +413,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; get_file(migf->filp); - async_data->mkey = mkey; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -365,57 +427,33 @@ err_exec: fput(migf->filp); kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; - } - - mdev = mvdev->mdev; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); + err = mlx5vf_dma_data_buffer(buf); if (err) - goto end; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; + return err; MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); - - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); - - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -end: - mutex_unlock(&migf->lock); - return err; + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) @@ -445,6 +483,10 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) WARN_ON(migf->mvdev->mdev_detach); + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ba760f956d535..b0f08dfc81207 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,11 +12,25 @@ #include #include +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 mkey; void *out; }; @@ -27,14 +41,7 @@ struct mlx5_vf_migration_file { u8 is_err:1; u32 pdn; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + struct mlx5_vhca_data_buffer *buf; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -124,12 +131,20 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 1916f7c1468c1..5f694fce854cf 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -33,7 +33,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) } static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -136,6 +128,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; ssize_t done = 0; if (pos) @@ -144,16 +137,16 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || migf->is_err)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { done = -EAGAIN; goto out_unlock; } - if (*pos > migf->total_length) { + if (*pos > vhca_buf->length) { done = -EINVAL; goto out_unlock; } @@ -162,7 +155,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); + len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { size_t page_offset; struct page *page; @@ -171,7 +164,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -208,7 +201,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->disabled || migf->is_err) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -227,6 +220,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -257,22 +252,23 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; + } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; + migf->buf = buf; return migf; out_save: - mlx5vf_disable_fd(migf); + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -286,6 +282,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; loff_t requested_length; ssize_t done = 0; @@ -306,10 +303,10 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; } - if (migf->allocated_length < requested_length) { + if (vhca_buf->allocated_length < requested_length) { done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, PAGE_SIZE)); if (done) goto out_unlock; @@ -323,7 +320,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -342,7 +339,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, len -= page_len; done += page_len; buf += page_len; - migf->total_length += page_len; + vhca_buf->length += page_len; } out_unlock: mutex_unlock(&migf->lock); @@ -360,6 +357,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -378,9 +376,18 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); end: @@ -474,7 +481,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); + mvdev->resuming_migf, + mvdev->resuming_migf->buf); if (ret) return ERR_PTR(ret); mlx5vf_disable_fds(mvdev); -- GitLab From 8b599d143419669e57da3881d8293f17809688d7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:30 +0200 Subject: [PATCH 1179/1423] vfio/mlx5: Refactor migration file state Refactor migration file state to be an emum which is mutual exclusive. As of that dropped the 'disabled' state as 'error' is the same from functional point of view. Next patches from the series will extend this enum for other relevant states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 2 +- drivers/vfio/pci/mlx5/cmd.h | 7 +++++-- drivers/vfio/pci/mlx5/main.c | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ed4c472d2eae5..fcba12326185c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,7 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index b0f08dfc81207..14403e654e4e4 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,6 +12,10 @@ #include #include +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; @@ -37,8 +41,7 @@ struct mlx5vf_async_data { struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; u32 pdn; struct mlx5_vhca_data_buffer *buf; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5f694fce854cf..d95646c2f010d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -109,7 +109,7 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); - migf->disabled = true; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -137,7 +137,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || + migf->state == MLX5_MIGF_STATE_ERROR)) return -ERESTARTSYS; } @@ -150,7 +151,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, done = -EINVAL; goto out_unlock; } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } @@ -199,7 +200,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; @@ -298,7 +299,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, return -ENOMEM; mutex_lock(&migf->lock); - if (migf->disabled) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } -- GitLab From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: [PATCH 1180/1423] vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. In case the queue is empty but the state of the migration file is MLX5_MIGF_STATE_COMPLETE, read() may not be blocked but will return 0 to indicate end of file. This is a step towards maintaining multiple images and their meta data (i.e. headers) on the migration file as part of next patches from the series. Note: At that point, we still use a single chunk on the migration file but becomes ready to support multiple. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++-------- 3 files changed, 136 insertions(+), 38 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba12326185c..0e36b4c8c8169 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e654e4e4..6e594689566e4 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int status; void *out; }; @@ -45,6 +48,8 @@ struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c2f010d..ca16425811c4b 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- GitLab From 3319d287f4c04b9deece8ea00e27a70bbe32941b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:32 +0200 Subject: [PATCH 1181/1423] vfio/mlx5: Introduce device transitions of PRE_COPY In order to support PRE_COPY, mlx5 driver is transferring multiple states (images) of the device. e.g.: the source VF can save and transfer multiple states, and the target VF will load them by that order. The device is saving three kinds of states: 1) Initial state - when the device moves to PRE_COPY state. 2) Middle state - during PRE_COPY phase via VFIO_MIG_GET_PRECOPY_INFO. There can be multiple states of this type. 3) Final state - when the device moves to STOP_COPY state. After moving to PRE_COPY state, user is holding the saving migf FD and can use it. For example: user can start transferring data via read() callback. Also, user can switch from PRE_COPY to STOP_COPY whenever he sees it fits. This will invoke saving of final state. This means that mlx5 VFIO device can be switched to STOP_COPY without transferring any data in PRE_COPY state. Therefore, when the device moves to STOP_COPY, mlx5 will store the final state on a dedicated queue entry on the list. Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 96 +++++++++++++++++++++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 16 +++++- drivers/vfio/pci/mlx5/main.c | 90 ++++++++++++++++++++++++++++++--- 3 files changed, 184 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0e36b4c8c8169..5fcece201d4c0 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,7 +63,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -59,6 +77,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -342,6 +362,56 @@ end: return ERR_PTR(ret); } +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -351,7 +421,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->buf = async_data->buf; + mlx5vf_put_data_buffer(async_data->buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -369,15 +439,19 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t image_size; unsigned long flags; - async_data->buf->length = - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size); + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->state = MLX5_MIGF_STATE_COMPLETE; + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -391,7 +465,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf) + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; @@ -412,9 +487,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -497,6 +575,8 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + list_splice(&migf->avail_list, &migf->buf_list); + while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6e594689566e4..34e61c7aa23d1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -38,6 +38,7 @@ struct mlx5vf_async_data { struct work_struct work; struct mlx5_vhca_data_buffer *buf; int status; + u8 last_chunk:1; void *out; }; @@ -47,9 +48,11 @@ struct mlx5_vf_migration_file { enum mlx5_vf_migf_state state; u32 pdn; + loff_t max_pos; struct mlx5_vhca_data_buffer *buf; spinlock_t list_lock; struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -129,10 +132,14 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -140,7 +147,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf); + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf); @@ -151,6 +159,10 @@ struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index ca16425811c4b..9cabba4560447 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -195,6 +195,7 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, if (*pos >= vhca_buf->start_pos + vhca_buf->length) { spin_lock_irq(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); spin_unlock_irq(&vhca_buf->migf->list_lock); } @@ -283,6 +284,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp, return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. + */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, @@ -291,8 +302,42 @@ static const struct file_operations mlx5vf_save_fops = { .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; @@ -328,8 +373,9 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) goto out_pd; @@ -339,7 +385,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) goto out_save; return migf; @@ -462,6 +508,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; out_pd: @@ -514,7 +561,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -522,7 +570,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -533,7 +582,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -541,7 +590,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -567,6 +619,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -635,7 +709,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, mutex_lock(&mvdev->state_mutex); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size); + &state_size, 0); if (!ret) *stop_copy_length = state_size; mlx5vf_state_mutex_unlock(mvdev); -- GitLab From 0c9a38fee8b210a8dfd3f177526daac567ec9265 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:33 +0200 Subject: [PATCH 1182/1423] vfio/mlx5: Introduce SW headers for migration states As mentioned in the previous patches, mlx5 is transferring multiple states when the PRE_COPY protocol is used. This states mechanism requires the target VM to know the states' size in order to execute multiple loads. Therefore, add SW header, with the needed information, for each saved state the source VM is transferring to the target VM. This patch implements the source VM handling of the headers, following patch will implement the target VM handling of the headers. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 56 ++++++++++++++++++++++++++++++++++-- drivers/vfio/pci/mlx5/cmd.h | 13 +++++++++ drivers/vfio/pci/mlx5/main.c | 2 +- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5fcece201d4c0..160fa38fc78dd 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,9 +351,11 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, if (ret) goto end; - ret = mlx5vf_dma_data_buffer(buf); - if (ret) - goto end; + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } } return buf; @@ -422,6 +424,8 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -431,6 +435,32 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -444,6 +474,11 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } async_data->buf->length = image_size; async_data->buf->start_pos = migf->max_pos; migf->max_pos += async_data->buf->length; @@ -455,6 +490,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context @@ -470,6 +506,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; int err; @@ -499,6 +536,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + async_data->header_buf = header_buf; get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, @@ -510,7 +557,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + if (header_buf) + mlx5vf_put_data_buffer(header_buf); fput(migf->filp); +err_free: kvfree(async_data->out); err_out: complete(&migf->save_comp); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 34e61c7aa23d1..3e36ccca820ad 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,16 +12,26 @@ #include #include +#define MLX5VF_PRE_COPY_SUPP(mvdev) \ + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) + enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_COMPLETE, }; +struct mlx5_vf_migration_header { + __le64 image_size; + /* For future use in case we may need to change the kernel protocol */ + __le64 flags; +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; u64 length; u64 allocated_length; + u64 header_image_size; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -37,6 +47,7 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *header_buf; int status; u8 last_chunk:1; void *out; @@ -165,6 +176,8 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9cabba4560447..9a36e36ec33b7 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -32,7 +32,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -static struct page * +struct page * mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { -- GitLab From 0dce165b1adf8d7f67030bb257e00107db8022de Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:34 +0200 Subject: [PATCH 1183/1423] vfio/mlx5: Introduce vfio precopy ioctl implementation vfio precopy ioctl returns an estimation of data available for transferring from the device. Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current state of the device, and if needed, append the dirty data to the transfer FD data. This is done by saving a middle state. As mlx5 runs the SAVE command asynchronously, make sure to query for incremental data only once there is no active save command. Running both in parallel, might end-up with a failure in the incremental query command on un-tracked vhca. Also, a middle state will be saved only after the previous state has finished its SAVE command and has been fully transferred, this prevents endless use resources. Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 16 +++++ drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 160fa38fc78dd..12e74ecebe641 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. + */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9a36e36ec33b7..2c8ac763057cd 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. + */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + return copy_to_user((void __user *)arg, &info, minsz); +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; -- GitLab From 81156c27271c4a6c594e492c8d119fbacfc99f36 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:35 +0200 Subject: [PATCH 1184/1423] vfio/mlx5: Consider temporary end of stream as part of PRE_COPY During PRE_COPY the migration data FD may have a temporary "end of stream" that is reached when the initial_bytes were read and no other dirty data exists yet. For instance, this may indicate that the device is idle and not currently dirtying any internal state. When read() is done on this temporary end of stream the kernel driver should return ENOMSG from read(). Userspace can wait for more data or consider moving to STOP_COPY. To not block the user upon read() and let it get ENOMSG we add a new state named MLX5_MIGF_STATE_PRE_COPY on the migration file. In addition, we add the MLX5_MIGF_STATE_SAVE_LAST state to block the read() once we call the last SAVE upon moving to STOP_COPY. Any further error will be marked with MLX5_MIGF_STATE_ERROR and the user won't be blocked. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-12-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 7 +++++-- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 12e74ecebe641..f6293da033cca 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -501,8 +501,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - if (async_data->last_chunk) - migf->state = MLX5_MIGF_STATE_COMPLETE; + migf->state = async_data->last_chunk ? + MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); } @@ -561,6 +561,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } } + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_LAST; + async_data->header_buf = header_buf; get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 3e36ccca820ad..d048f23977dd1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -17,6 +17,8 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY, + MLX5_MIGF_STATE_SAVE_LAST, MLX5_MIGF_STATE_COMPLETE, }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 2c8ac763057cd..44b1543c751c9 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (wait_event_interruptible(migf->poll_wait, !list_empty(&migf->buf_list) || migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY || migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } @@ -236,6 +237,12 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, &end_of_data); if (first_loop_call) { first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) { + done = -ENOMSG; + goto out_unlock; + } + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { if (filp->f_flags & O_NONBLOCK) { done = -EAGAIN; -- GitLab From 34e2f27143d1b373f088e805f7e11cdf778f791d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:36 +0200 Subject: [PATCH 1185/1423] vfio/mlx5: Introduce multiple loads In order to support PRE_COPY, mlx5 driver transfers multiple states (images) of the device. e.g.: the source VF can save and transfer multiple states, and the target VF will load them by that order. This patch implements the changes for the target VF to decompose the header for each state and to write and load multiple states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-13-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 13 +- drivers/vfio/pci/mlx5/cmd.h | 10 ++ drivers/vfio/pci/mlx5/main.c | 279 +++++++++++++++++++++++++++++------ 3 files changed, 257 insertions(+), 45 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f6293da033cca..993749818d90f 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -598,9 +598,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - err = mlx5vf_dma_data_buffer(buf); - if (err) - return err; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; + } MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); @@ -644,6 +646,11 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + list_splice(&migf->avail_list, &migf->buf_list); while ((entry = list_first_entry_or_null(&migf->buf_list, diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index d048f23977dd1..7729eac8c78c1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -22,6 +22,14 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_COMPLETE, }; +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + struct mlx5_vf_migration_header { __le64 image_size; /* For future use in case we may need to change the kernel protocol */ @@ -60,9 +68,11 @@ struct mlx5_vf_migration_file { struct mutex lock; enum mlx5_vf_migf_state state; + enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 44b1543c751c9..5a669b73994a7 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -518,13 +518,162 @@ end: return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -534,56 +683,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) { - done = -ENODEV; + ret = -ENODEV; goto out_unlock; } - if (vhca_buf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if (vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; + vhca_buf = migf->buf; + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; + + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - vhca_buf->length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? ret : done; } static const struct file_operations mlx5vf_resume_fops = { @@ -623,12 +799,29 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) } migf->buf = buf; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + buf = mlx5vf_alloc_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_buf; + } + + migf->buf_header = buf; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + } else { + /* Initial state will be to read the image */ + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; + } + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; +out_buf: + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -728,11 +921,13 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf, - mvdev->resuming_migf->buf); - if (ret) - return ERR_PTR(ret); + if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { + ret = mlx5vf_cmd_load_vhca_state(mvdev, + mvdev->resuming_migf, + mvdev->resuming_migf->buf); + if (ret) + return ERR_PTR(ret); + } mlx5vf_disable_fds(mvdev); return NULL; } -- GitLab From d6e18a4bec431c181a60d32876c6c89955b2a4f8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:37 +0200 Subject: [PATCH 1186/1423] vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error Before a SAVE command is issued, a QUERY command is issued in order to know the device data size. In case PRE_COPY is used, the above commands are issued while the device is running. Thus, it is possible that between the QUERY and the SAVE commands the state of the device will be changed significantly and thus the SAVE will fail. Currently, if a SAVE command is failing, the driver will fail the migration. In the above case, don't fail the migration, but don't allow for new SAVEs to be executed while the device is in a RUNNING state. Once the device will be moved to STOP_COPY, SAVE can be executed again and the full device state will be read. Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 27 ++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 6 ++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 993749818d90f..01ef695e94418 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -84,6 +84,19 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } } MLX5_SET(query_vhca_migration_state_in, in, opcode, @@ -442,7 +455,10 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - migf->state = MLX5_MIGF_STATE_ERROR; + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); @@ -511,6 +527,8 @@ err: * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } @@ -534,6 +552,13 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. + */ + inc = false; + MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 7729eac8c78c1..5483171d57ad3 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -17,6 +17,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY_ERROR, MLX5_MIGF_STATE_PRE_COPY, MLX5_MIGF_STATE_SAVE_LAST, MLX5_MIGF_STATE_COMPLETE, @@ -157,6 +158,7 @@ struct mlx5vf_pci_core_device { enum { MLX5VF_QUERY_INC = (1UL << 0), + MLX5VF_QUERY_FINAL = (1UL << 1), }; int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5a669b73994a7..cd90eb86128c3 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (wait_event_interruptible(migf->poll_wait, !list_empty(&migf->buf_list) || migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || migf->state == MLX5_MIGF_STATE_PRE_COPY || migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; @@ -238,7 +239,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (first_loop_call) { first_loop_call = false; /* Temporary end of file as part of PRE_COPY */ - if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) { + if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { done = -ENOMSG; goto out_unlock; } @@ -431,7 +433,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) return -ENODEV; ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, - MLX5VF_QUERY_INC); + MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; -- GitLab From ccc2a52e464d1c983efede1a6d44728c151cb2ed Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:38 +0200 Subject: [PATCH 1187/1423] vfio/mlx5: Enable MIGRATION_PRE_COPY flag Now that everything has been set up for MIGRATION_PRE_COPY, enable it. Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-15-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 01ef695e94418..64e68d13cb98f 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -222,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) + mvdev->core_device.vdev.migration_flags |= + VFIO_MIGRATION_PRE_COPY; + end: mlx5_vf_put_core_dev(mvdev->mdev); } -- GitLab From 64ffbbb1e948876dd9044c32a3ab8a790662b9bb Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:33 +0000 Subject: [PATCH 1188/1423] hisi_acc_vfio_pci: Add support for precopy IOCTL PRECOPY IOCTL in the case of HiSiIicon ACC driver can be used to perform the device compatibility check earlier during migration. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 52 +++++++++++++++++++ .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 53 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0c0c0c7f05215..f3b74a06edb68 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -764,9 +764,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; return migf; } +static long hisi_acc_vf_precopy_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct hisi_acc_vf_migration_file *migf = filp->private_data; + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&hisi_acc_vdev->state_mutex); + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { + mutex_unlock(&hisi_acc_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > migf->total_length) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->total_length - *pos; + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +out: + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); + return ret; +} + static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { @@ -807,6 +856,8 @@ out_unlock: static const struct file_operations hisi_acc_vf_save_fops = { .owner = THIS_MODULE, .read = hisi_acc_vf_save_read, + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, .llseek = no_llseek, }; @@ -832,6 +883,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; ret = vf_qm_state_save(hisi_acc_vdev, migf); if (ret) { diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 67343325b3201..11d51345f5b51 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -91,6 +91,7 @@ struct hisi_acc_vf_migration_file { struct mutex lock; bool disabled; + struct hisi_acc_vf_core_device *hisi_acc_vdev; struct acc_vf_data vf_data; size_t total_length; }; -- GitLab From d9a871e4a143047d1d84a606772af319f11516f9 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:34 +0000 Subject: [PATCH 1189/1423] hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions The saving_migf is open in PRE_COPY state if it is supported and reads initial device match data. hisi_acc_vf_stop_copy() is refactored to make use of common code. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 74 ++++++++++++++++++- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index f3b74a06edb68..c8658636a84c0 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -863,7 +863,7 @@ static const struct file_operations hisi_acc_vf_save_fops = { }; static struct hisi_acc_vf_migration_file * -hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; int ret; @@ -885,7 +885,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) mutex_init(&migf->lock); migf->hisi_acc_vdev = hisi_acc_vdev; - ret = vf_qm_state_save(hisi_acc_vdev, migf); + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); if (ret) { fput(migf->filp); return ERR_PTR(ret); @@ -894,6 +894,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) return migf; } +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + + migf->total_length = QM_MATCH_SIZE; + return migf; +} + +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) +{ + int ret; + struct hisi_acc_vf_migration_file *migf = NULL; + + if (open) { + /* + * Userspace didn't use PRECOPY support. Hence saving_migf + * is not opened yet. + */ + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + } else { + migf = hisi_acc_vdev->saving_migf; + } + + ret = vf_qm_state_save(hisi_acc_vdev, migf); + if (ret) + return ERR_PTR(ret); + + return open ? migf : NULL; +} + static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct device *dev = &hisi_acc_vdev->vf_dev->dev; @@ -921,6 +959,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 cur = hisi_acc_vdev->mig_state; int ret; + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + hisi_acc_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct hisi_acc_vf_migration_file *migf; + + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); + if (ret) + return ERR_PTR(ret); + + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + return NULL; + } + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { ret = hisi_acc_vf_stop_device(hisi_acc_vdev); if (ret) @@ -931,7 +994,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct hisi_acc_vf_migration_file *migf; - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -963,6 +1026,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, return NULL; } + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { + hisi_acc_vf_disable_fds(hisi_acc_vdev); + return NULL; + } + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) { hisi_acc_vf_start_device(hisi_acc_vdev); return NULL; -- GitLab From 190125adcad4c5850fd74ecd697e20a446b74ed8 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:35 +0000 Subject: [PATCH 1190/1423] hisi_acc_vfio_pci: Move the dev compatibility tests for early check Instead of waiting till data transfer is complete to perform dev compatibility, do it as soon as we have enough data to perform the check. This will be useful when we enable the support for PRE_COPY. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 19 +++++++------------ .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c8658636a84c0..9a51f41e1d2aa 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 que_iso_state; int ret; - if (migf->total_length < QM_MATCH_SIZE) - return -EINVAL; + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) + return 0; if (vf_data->acc_magic != ACC_DEV_MAGIC) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); @@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + hisi_acc_vdev->match_done = true; return 0; } @@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct device *dev = &vf_qm->pdev->dev; int ret; - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); - if (ret) - return ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { /* Update state and return with match data */ vf_data->vf_qm_state = QM_NOT_READY; @@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; - /* Check dev compatibility */ - ret = vf_qm_check_match(hisi_acc_vdev, migf); - if (ret) { - dev_err(dev, "failed to match the VF!\n"); - return ret; - } /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { @@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu *pos += len; done = len; migf->total_length += len; + + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); + if (ret) + done = -EFAULT; out_unlock: mutex_unlock(&migf->lock); return done; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 11d51345f5b51..dcabfeec6ca19 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -98,6 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; + u8 match_done:1; u8 deferred_reset:1; /* For migration state */ struct mutex state_mutex; -- GitLab From f2240b4441cc5bd87820371ee79ad7c9125e76b6 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:36 +0000 Subject: [PATCH 1191/1423] hisi_acc_vfio_pci: Enable PRE_COPY flag Now that we have everything to support the PRE_COPY state, enable it. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 9a51f41e1d2aa..51941bb4f31f0 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1351,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; return vfio_pci_core_init_dev(core_vdev); -- GitLab From 533aae7c94dbc2b14301cfd68ae7e0e90f0c8438 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 5 Dec 2022 13:39:02 +0100 Subject: [PATCH 1192/1423] gpiolib: cdev: fix NULL-pointer dereferences There are several places where we can crash the kernel by requesting lines, unbinding the GPIO device, then calling any of the system calls relevant to the GPIO character device's annonymous file descriptors: ioctl(), read(), poll(). While I observed it with the GPIO simulator, it will also happen for any of the GPIO devices that can be hot-unplugged - for instance any HID GPIO expander (e.g. CP2112). This affects both v1 and v2 uAPI. This fixes it partially by checking if gdev->chip is not NULL but it doesn't entirely remedy the situation as we still have a race condition in which another thread can remove the device after the check. Fixes: d7c51b47ac11 ("gpio: userspace ABI for reading/writing GPIO lines") Fixes: 3c0d9c635ae2 ("gpiolib: cdev: support GPIO_V2_GET_LINE_IOCTL and GPIO_V2_LINE_GET_VALUES_IOCTL") Fixes: aad955842d1c ("gpiolib: cdev: support GPIO_V2_GET_LINEINFO_IOCTL and GPIO_V2_GET_LINEINFO_WATCH_IOCTL") Fixes: a54756cb24ea ("gpiolib: cdev: support GPIO_V2_LINE_SET_CONFIG_IOCTL") Fixes: 7b8e00d98168 ("gpiolib: cdev: support GPIO_V2_LINE_SET_VALUES_IOCTL") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij --- drivers/gpio/gpiolib-cdev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 08606f32372ce..ac10c9494bf02 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -201,6 +201,9 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd, unsigned int i; int ret; + if (!lh->gdev->chip) + return -ENODEV; + switch (cmd) { case GPIOHANDLE_GET_LINE_VALUES_IOCTL: /* NOTE: It's okay to read values of output lines */ @@ -1384,6 +1387,9 @@ static long linereq_ioctl(struct file *file, unsigned int cmd, struct linereq *lr = file->private_data; void __user *ip = (void __user *)arg; + if (!lr->gdev->chip) + return -ENODEV; + switch (cmd) { case GPIO_V2_LINE_GET_VALUES_IOCTL: return linereq_get_values(lr, ip); @@ -1410,6 +1416,9 @@ static __poll_t linereq_poll(struct file *file, struct linereq *lr = file->private_data; __poll_t events = 0; + if (!lr->gdev->chip) + return EPOLLHUP | EPOLLERR; + poll_wait(file, &lr->wait, wait); if (!kfifo_is_empty_spinlocked_noirqsave(&lr->events, @@ -1429,6 +1438,9 @@ static ssize_t linereq_read(struct file *file, ssize_t bytes_read = 0; int ret; + if (!lr->gdev->chip) + return -ENODEV; + if (count < sizeof(le)) return -EINVAL; @@ -1716,6 +1728,9 @@ static __poll_t lineevent_poll(struct file *file, struct lineevent_state *le = file->private_data; __poll_t events = 0; + if (!le->gdev->chip) + return EPOLLHUP | EPOLLERR; + poll_wait(file, &le->wait, wait); if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock)) @@ -1740,6 +1755,9 @@ static ssize_t lineevent_read(struct file *file, ssize_t ge_size; int ret; + if (!le->gdev->chip) + return -ENODEV; + /* * When compatible system call is being used the struct gpioevent_data, * in case of at least ia32, has different size due to the alignment @@ -1821,6 +1839,9 @@ static long lineevent_ioctl(struct file *file, unsigned int cmd, void __user *ip = (void __user *)arg; struct gpiohandle_data ghd; + if (!le->gdev->chip) + return -ENODEV; + /* * We can get the value for an event line but not set it, * because it is input by definition. @@ -2407,6 +2428,9 @@ static __poll_t lineinfo_watch_poll(struct file *file, struct gpio_chardev_data *cdev = file->private_data; __poll_t events = 0; + if (!cdev->gdev->chip) + return EPOLLHUP | EPOLLERR; + poll_wait(file, &cdev->wait, pollt); if (!kfifo_is_empty_spinlocked_noirqsave(&cdev->events, @@ -2425,6 +2449,9 @@ static ssize_t lineinfo_watch_read(struct file *file, char __user *buf, int ret; size_t event_size; + if (!cdev->gdev->chip) + return -ENODEV; + #ifndef CONFIG_GPIO_CDEV_V1 event_size = sizeof(struct gpio_v2_line_info_changed); if (count < event_size) -- GitLab From bdbbae241a04f387ba910b8609f95fad5f1470c7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 5 Dec 2022 13:39:03 +0100 Subject: [PATCH 1193/1423] gpiolib: protect the GPIO device against being dropped while in use by user-space While any of the GPIO cdev syscalls is in progress, the kernel can call gpiochip_remove() (for instance, when a USB GPIO expander is disconnected) which will set gdev->chip to NULL after which any subsequent access will cause a crash. To avoid that: use an RW-semaphore in which the syscalls take it for reading (so that we don't needlessly prohibit the user-space from calling syscalls simultaneously) while gpiochip_remove() takes it for writing so that it can only happen once all syscalls return. Fixes: d7c51b47ac11 ("gpio: userspace ABI for reading/writing GPIO lines") Fixes: 3c0d9c635ae2 ("gpiolib: cdev: support GPIO_V2_GET_LINE_IOCTL and GPIO_V2_LINE_GET_VALUES_IOCTL") Fixes: aad955842d1c ("gpiolib: cdev: support GPIO_V2_GET_LINEINFO_IOCTL and GPIO_V2_GET_LINEINFO_WATCH_IOCTL") Fixes: a54756cb24ea ("gpiolib: cdev: support GPIO_V2_LINE_SET_CONFIG_IOCTL") Fixes: 7b8e00d98168 ("gpiolib: cdev: support GPIO_V2_LINE_SET_VALUES_IOCTL") Signed-off-by: Bartosz Golaszewski [Nick: fixed a build failure with CDEV_V1 disabled] Co-authored-by: Nick Hainke Reviewed-by: Kent Gibson Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij --- drivers/gpio/gpiolib-cdev.c | 177 +++++++++++++++++++++++++++++++----- drivers/gpio/gpiolib.c | 4 + drivers/gpio/gpiolib.h | 5 + 3 files changed, 161 insertions(+), 25 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index ac10c9494bf02..f7289c2f3a3c3 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -55,6 +55,50 @@ static_assert(IS_ALIGNED(sizeof(struct gpio_v2_line_values), 8)); * interface to gpiolib GPIOs via ioctl()s. */ +typedef __poll_t (*poll_fn)(struct file *, struct poll_table_struct *); +typedef long (*ioctl_fn)(struct file *, unsigned int, unsigned long); +typedef ssize_t (*read_fn)(struct file *, char __user *, + size_t count, loff_t *); + +static __poll_t call_poll_locked(struct file *file, + struct poll_table_struct *wait, + struct gpio_device *gdev, poll_fn func) +{ + __poll_t ret; + + down_read(&gdev->sem); + ret = func(file, wait); + up_read(&gdev->sem); + + return ret; +} + +static long call_ioctl_locked(struct file *file, unsigned int cmd, + unsigned long arg, struct gpio_device *gdev, + ioctl_fn func) +{ + long ret; + + down_read(&gdev->sem); + ret = func(file, cmd, arg); + up_read(&gdev->sem); + + return ret; +} + +static ssize_t call_read_locked(struct file *file, char __user *buf, + size_t count, loff_t *f_ps, + struct gpio_device *gdev, read_fn func) +{ + ssize_t ret; + + down_read(&gdev->sem); + ret = func(file, buf, count, f_ps); + up_read(&gdev->sem); + + return ret; +} + /* * GPIO line handle management */ @@ -191,8 +235,8 @@ static long linehandle_set_config(struct linehandle_state *lh, return 0; } -static long linehandle_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long linehandle_ioctl_unlocked(struct file *file, unsigned int cmd, + unsigned long arg) { struct linehandle_state *lh = file->private_data; void __user *ip = (void __user *)arg; @@ -250,6 +294,15 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd, } } +static long linehandle_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct linehandle_state *lh = file->private_data; + + return call_ioctl_locked(file, cmd, arg, lh->gdev, + linehandle_ioctl_unlocked); +} + #ifdef CONFIG_COMPAT static long linehandle_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -1381,8 +1434,8 @@ static long linereq_set_config(struct linereq *lr, void __user *ip) return ret; } -static long linereq_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long linereq_ioctl_unlocked(struct file *file, unsigned int cmd, + unsigned long arg) { struct linereq *lr = file->private_data; void __user *ip = (void __user *)arg; @@ -1402,6 +1455,15 @@ static long linereq_ioctl(struct file *file, unsigned int cmd, } } +static long linereq_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct linereq *lr = file->private_data; + + return call_ioctl_locked(file, cmd, arg, lr->gdev, + linereq_ioctl_unlocked); +} + #ifdef CONFIG_COMPAT static long linereq_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -1410,8 +1472,8 @@ static long linereq_ioctl_compat(struct file *file, unsigned int cmd, } #endif -static __poll_t linereq_poll(struct file *file, - struct poll_table_struct *wait) +static __poll_t linereq_poll_unlocked(struct file *file, + struct poll_table_struct *wait) { struct linereq *lr = file->private_data; __poll_t events = 0; @@ -1428,10 +1490,16 @@ static __poll_t linereq_poll(struct file *file, return events; } -static ssize_t linereq_read(struct file *file, - char __user *buf, - size_t count, - loff_t *f_ps) +static __poll_t linereq_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct linereq *lr = file->private_data; + + return call_poll_locked(file, wait, lr->gdev, linereq_poll_unlocked); +} + +static ssize_t linereq_read_unlocked(struct file *file, char __user *buf, + size_t count, loff_t *f_ps) { struct linereq *lr = file->private_data; struct gpio_v2_line_event le; @@ -1485,6 +1553,15 @@ static ssize_t linereq_read(struct file *file, return bytes_read; } +static ssize_t linereq_read(struct file *file, char __user *buf, + size_t count, loff_t *f_ps) +{ + struct linereq *lr = file->private_data; + + return call_read_locked(file, buf, count, f_ps, lr->gdev, + linereq_read_unlocked); +} + static void linereq_free(struct linereq *lr) { unsigned int i; @@ -1722,8 +1799,8 @@ struct lineevent_state { (GPIOEVENT_REQUEST_RISING_EDGE | \ GPIOEVENT_REQUEST_FALLING_EDGE) -static __poll_t lineevent_poll(struct file *file, - struct poll_table_struct *wait) +static __poll_t lineevent_poll_unlocked(struct file *file, + struct poll_table_struct *wait) { struct lineevent_state *le = file->private_data; __poll_t events = 0; @@ -1739,15 +1816,21 @@ static __poll_t lineevent_poll(struct file *file, return events; } +static __poll_t lineevent_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct lineevent_state *le = file->private_data; + + return call_poll_locked(file, wait, le->gdev, lineevent_poll_unlocked); +} + struct compat_gpioeevent_data { compat_u64 timestamp; u32 id; }; -static ssize_t lineevent_read(struct file *file, - char __user *buf, - size_t count, - loff_t *f_ps) +static ssize_t lineevent_read_unlocked(struct file *file, char __user *buf, + size_t count, loff_t *f_ps) { struct lineevent_state *le = file->private_data; struct gpioevent_data ge; @@ -1815,6 +1898,15 @@ static ssize_t lineevent_read(struct file *file, return bytes_read; } +static ssize_t lineevent_read(struct file *file, char __user *buf, + size_t count, loff_t *f_ps) +{ + struct lineevent_state *le = file->private_data; + + return call_read_locked(file, buf, count, f_ps, le->gdev, + lineevent_read_unlocked); +} + static void lineevent_free(struct lineevent_state *le) { if (le->irq) @@ -1832,8 +1924,8 @@ static int lineevent_release(struct inode *inode, struct file *file) return 0; } -static long lineevent_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long lineevent_ioctl_unlocked(struct file *file, unsigned int cmd, + unsigned long arg) { struct lineevent_state *le = file->private_data; void __user *ip = (void __user *)arg; @@ -1864,6 +1956,15 @@ static long lineevent_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } +static long lineevent_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct lineevent_state *le = file->private_data; + + return call_ioctl_locked(file, cmd, arg, le->gdev, + lineevent_ioctl_unlocked); +} + #ifdef CONFIG_COMPAT static long lineevent_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -2422,8 +2523,8 @@ static int lineinfo_changed_notify(struct notifier_block *nb, return NOTIFY_OK; } -static __poll_t lineinfo_watch_poll(struct file *file, - struct poll_table_struct *pollt) +static __poll_t lineinfo_watch_poll_unlocked(struct file *file, + struct poll_table_struct *pollt) { struct gpio_chardev_data *cdev = file->private_data; __poll_t events = 0; @@ -2440,8 +2541,17 @@ static __poll_t lineinfo_watch_poll(struct file *file, return events; } -static ssize_t lineinfo_watch_read(struct file *file, char __user *buf, - size_t count, loff_t *off) +static __poll_t lineinfo_watch_poll(struct file *file, + struct poll_table_struct *pollt) +{ + struct gpio_chardev_data *cdev = file->private_data; + + return call_poll_locked(file, pollt, cdev->gdev, + lineinfo_watch_poll_unlocked); +} + +static ssize_t lineinfo_watch_read_unlocked(struct file *file, char __user *buf, + size_t count, loff_t *off) { struct gpio_chardev_data *cdev = file->private_data; struct gpio_v2_line_info_changed event; @@ -2519,6 +2629,15 @@ static ssize_t lineinfo_watch_read(struct file *file, char __user *buf, return bytes_read; } +static ssize_t lineinfo_watch_read(struct file *file, char __user *buf, + size_t count, loff_t *off) +{ + struct gpio_chardev_data *cdev = file->private_data; + + return call_read_locked(file, buf, count, off, cdev->gdev, + lineinfo_watch_read_unlocked); +} + /** * gpio_chrdev_open() - open the chardev for ioctl operations * @inode: inode for this chardev @@ -2532,13 +2651,17 @@ static int gpio_chrdev_open(struct inode *inode, struct file *file) struct gpio_chardev_data *cdev; int ret = -ENOMEM; + down_read(&gdev->sem); + /* Fail on open if the backing gpiochip is gone */ - if (!gdev->chip) - return -ENODEV; + if (!gdev->chip) { + ret = -ENODEV; + goto out_unlock; + } cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) - return -ENOMEM; + goto out_unlock; cdev->watched_lines = bitmap_zalloc(gdev->chip->ngpio, GFP_KERNEL); if (!cdev->watched_lines) @@ -2561,6 +2684,8 @@ static int gpio_chrdev_open(struct inode *inode, struct file *file) if (ret) goto out_unregister_notifier; + up_read(&gdev->sem); + return ret; out_unregister_notifier: @@ -2570,6 +2695,8 @@ out_free_bitmap: bitmap_free(cdev->watched_lines); out_free_cdev: kfree(cdev); +out_unlock: + up_read(&gdev->sem); return ret; } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0058ee83989df..b8e17a4e38c51 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -790,6 +790,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, spin_unlock_irqrestore(&gpio_lock, flags); BLOCKING_INIT_NOTIFIER_HEAD(&gdev->notifier); + init_rwsem(&gdev->sem); #ifdef CONFIG_PINCTRL INIT_LIST_HEAD(&gdev->pin_ranges); @@ -924,6 +925,8 @@ void gpiochip_remove(struct gpio_chip *gc) unsigned long flags; unsigned int i; + down_write(&gdev->sem); + /* FIXME: should the legacy sysfs handling be moved to gpio_device? */ gpiochip_sysfs_unregister(gdev); gpiochip_free_hogs(gc); @@ -958,6 +961,7 @@ void gpiochip_remove(struct gpio_chip *gc) * gone. */ gcdev_unregister(gdev); + up_write(&gdev->sem); put_device(&gdev->dev); } EXPORT_SYMBOL_GPL(gpiochip_remove); diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index e443c1023a374..b3c2db6eba80c 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -15,6 +15,7 @@ #include #include #include +#include #define GPIOCHIP_NAME "gpiochip" @@ -39,6 +40,9 @@ * @list: links gpio_device:s together for traversal * @notifier: used to notify subscribers about lines being requested, released * or reconfigured + * @sem: protects the structure from a NULL-pointer dereference of @chip by + * user-space operations when the device gets unregistered during + * a hot-unplug event * @pin_ranges: range of pins served by the GPIO driver * * This state container holds most of the runtime variable data @@ -60,6 +64,7 @@ struct gpio_device { void *data; struct list_head list; struct blocking_notifier_head notifier; + struct rw_semaphore sem; #ifdef CONFIG_PINCTRL /* -- GitLab From d074f0aebde5649f7a9f1807551efc019b8e81c4 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 7 Dec 2022 16:32:18 +0800 Subject: [PATCH 1194/1423] RDMA/hfi1: use sysfs_emit() to instead of scnprintf() Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212071632188074249@zte.com.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 8e71bef9d9826..bcc6bc0540f03 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -112,7 +112,7 @@ static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) cap_mask &= ~HFI1_CAP_LOCKED_SMASK; cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); - return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); + return sysfs_emit(buffer, "0x%lx\n", cap_mask); } struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) -- GitLab From 7ccb966779645636679a723588b7bae4f0a8d7d5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 14 Nov 2022 10:42:25 -0800 Subject: [PATCH 1195/1423] PCI: aardvark: Switch to using devm_gpiod_get_optional() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch the driver to the generic version of gpiod API (and away from OF-specific variant), so that we can stop exporting devm_gpiod_get_from_of_node(). Link: https://lore.kernel.org/r/Y3KMEZFv6dpxA+Gv@google.com Signed-off-by: Dmitry Torokhov Signed-off-by: Bjorn Helgaas Reviewed-by: Linus Walleij Acked-by: Pali Rohár --- drivers/pci/controller/pci-aardvark.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index ba36bbc5897d0..513d8edf3a5cf 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1859,20 +1859,18 @@ static int advk_pcie_probe(struct platform_device *pdev) return ret; } - pcie->reset_gpio = devm_gpiod_get_from_of_node(dev, dev->of_node, - "reset-gpios", 0, - GPIOD_OUT_LOW, - "pcie1-reset"); + pcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); ret = PTR_ERR_OR_ZERO(pcie->reset_gpio); if (ret) { - if (ret == -ENOENT) { - pcie->reset_gpio = NULL; - } else { - if (ret != -EPROBE_DEFER) - dev_err(dev, "Failed to get reset-gpio: %i\n", - ret); - return ret; - } + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get reset-gpio: %i\n", ret); + return ret; + } + + ret = gpiod_set_consumer_name(pcie->reset_gpio, "pcie1-reset"); + if (ret) { + dev_err(dev, "Failed to set reset gpio name: %d\n", ret); + return ret; } ret = of_pci_get_max_link_speed(dev->of_node); -- GitLab From 6d4671b534f6c084e92ef167a52dc47e55f636c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 27 Sep 2022 16:19:17 +0200 Subject: [PATCH 1196/1423] PCI: pciehp: Enable Command Completed Interrupt only if supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The No Command Completed Support bit in the Slot Capabilities register indicates whether Command Completed Interrupt Enable is unsupported. We already check whether No Command Completed Support bit is set in pcie_wait_cmd(), and do not wait in this case. Don't enable this Command Completed Interrupt at all if NCCS is set, so that when users dump configuration space from userspace, the dump does not confuse them by saying that Command Completed Interrupt is not supported, but it is enabled. Link: https://lore.kernel.org/r/20220927141926.8895-2-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner --- drivers/pci/hotplug/pciehp_hpc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 040ae076ec0e9..10e9670eea0b0 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -811,7 +811,9 @@ static void pcie_enable_notification(struct controller *ctrl) else cmd |= PCI_EXP_SLTCTL_PDCE; if (!pciehp_poll_mode) - cmd |= PCI_EXP_SLTCTL_HPIE | PCI_EXP_SLTCTL_CCIE; + cmd |= PCI_EXP_SLTCTL_HPIE; + if (!pciehp_poll_mode && !NO_CMD_CMPL(ctrl)) + cmd |= PCI_EXP_SLTCTL_CCIE; mask = (PCI_EXP_SLTCTL_PDCE | PCI_EXP_SLTCTL_ABPE | PCI_EXP_SLTCTL_PFDE | -- GitLab From 3256412fc57b6cabbc1cf1317d020e42f6d7aeab Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 8 Nov 2022 09:58:11 +0800 Subject: [PATCH 1197/1423] i2c: hisi: Add support to get clock frequency from clock The clk_rate attribute is not generic device tree bindings for I2C busses described in Documentation/devicetree/bindings/i2c/i2c.txt. It can be managed by clock binding. Support the driver to obtain clock information by clk_rate or clock property. Find clock first, if not, fall back to clk_rate. Signed-off-by: Weilong Chen Acked-by: Yicong Yang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-hisi.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-hisi.c b/drivers/i2c/busses/i2c-hisi.c index bcc97e4fcb654..8c6c7075c765c 100644 --- a/drivers/i2c/busses/i2c-hisi.c +++ b/drivers/i2c/busses/i2c-hisi.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -88,6 +89,7 @@ struct hisi_i2c_controller { struct i2c_adapter adapter; void __iomem *iobase; struct device *dev; + struct clk *clk; int irq; /* Intermediates for recording the transfer process */ @@ -454,10 +456,15 @@ static int hisi_i2c_probe(struct platform_device *pdev) return ret; } - ret = device_property_read_u64(dev, "clk_rate", &clk_rate_hz); - if (ret) { - dev_err(dev, "failed to get clock frequency, ret = %d\n", ret); - return ret; + ctlr->clk = devm_clk_get_optional_enabled(&pdev->dev, NULL); + if (IS_ERR_OR_NULL(ctlr->clk)) { + ret = device_property_read_u64(dev, "clk_rate", &clk_rate_hz); + if (ret) { + dev_err(dev, "failed to get clock frequency, ret = %d\n", ret); + return ret; + } + } else { + clk_rate_hz = clk_get_rate(ctlr->clk); } ctlr->clk_rate_khz = DIV_ROUND_UP_ULL(clk_rate_hz, HZ_PER_KHZ); -- GitLab From 810199f7315604bd969409109f1c96b4ebe772ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 19 Oct 2022 22:28:08 +0200 Subject: [PATCH 1198/1423] i2c: xiic: Make sure to disable clock on .remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If for whatever reasons pm_runtime_resume_and_get() failed, .remove() is exited early, the clock isn't freed and runtime PM state isn't reset. The right thing to do however is to free all resources that don't need HW access after a problem with runtime PM. Also issue a warning in that case and return 0 to suppress a less helpful warning by the driver core. Signed-off-by: Uwe Kleine-König Acked-by: Michal Simek Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-xiic.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index 277a02455cddd..bee5a2ef1f229 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -858,11 +858,14 @@ static int xiic_i2c_remove(struct platform_device *pdev) /* remove adapter & data */ i2c_del_adapter(&i2c->adap); - ret = pm_runtime_resume_and_get(i2c->dev); + ret = pm_runtime_get_sync(i2c->dev); + if (ret < 0) - return ret; + dev_warn(&pdev->dev, "Failed to activate device for removal (%pe)\n", + ERR_PTR(ret)); + else + xiic_deinit(i2c); - xiic_deinit(i2c); pm_runtime_put_sync(i2c->dev); clk_disable_unprepare(i2c->clk); pm_runtime_disable(&pdev->dev); -- GitLab From b3525072835b523b397d459fadd0785d9c24bbd1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Oct 2022 14:29:39 +0100 Subject: [PATCH 1199/1423] orangefs: remove variable i Variable i is just being incremented and it's never used anywhere else. The variable and the increment are redundant so remove it. Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698de..eaa35a9661151 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t total_count = 0; ssize_t ret = -EINVAL; - int i = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", @@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, while (iov_iter_count(iter)) { size_t each_count = iov_iter_count(iter); size_t amt_complete; - i++; /* how much to transfer in this loop iteration */ if (each_count > orangefs_bufmap_size_query()) -- GitLab From 610defdccff7b955fe899a825990c2202153a22e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Oct 2022 22:49:37 +0100 Subject: [PATCH 1200/1423] orangefs: remove redundant assignment to variable buffer_index The variable buffer_index is assigned a value that is never read, it is assigned just before the function returns. The assignment is redundant and can be removed. Cleans up clang scan build warning: fs/orangefs/file.c:276:3: warning: Value stored to 'buffer_index' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 732661aa26804..167fa43b24f9a 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -273,7 +273,6 @@ out: gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): PUT buffer_index %d\n", __func__, handle, buffer_index); - buffer_index = -1; } op_release(new_op); return ret; -- GitLab From ea60a4ad0cf88b411cde6888b8c890935686ecd7 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:04 +0800 Subject: [PATCH 1201/1423] orangefs: Fix sysfs not cleanup when dev init failed When the dev init failed, should cleanup the sysfs, otherwise, the module will never be loaded since can not create duplicate sysfs directory: sysfs: cannot create duplicate filename '/fs/orangefs' CPU: 1 PID: 6549 Comm: insmod Tainted: G W 6.0.0+ #44 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Call Trace: dump_stack_lvl+0x34/0x44 sysfs_warn_dup.cold+0x17/0x24 sysfs_create_dir_ns+0x16d/0x180 kobject_add_internal+0x156/0x3a0 kobject_init_and_add+0xcf/0x120 orangefs_sysfs_init+0x7e/0x3a0 [orangefs] orangefs_init+0xfe/0x1000 [orangefs] do_one_initcall+0x87/0x2a0 do_init_module+0xdf/0x320 load_module+0x2f98/0x3330 __do_sys_finit_module+0x113/0x1b0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 kobject_add_internal failed for orangefs with -EEXIST, don't try to register things with the same name in the same directory. Fixes: 2f83ace37181 ("orangefs: put register_chrdev immediately before register_filesystem") Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-mod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index cd7297815f91e..5ab741c60b7e2 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -141,7 +141,7 @@ static int __init orangefs_init(void) gossip_err("%s: could not initialize device subsystem %d!\n", __func__, ret); - goto cleanup_device; + goto cleanup_sysfs; } ret = register_filesystem(&orangefs_fs_type); @@ -152,11 +152,11 @@ static int __init orangefs_init(void) goto out; } - orangefs_sysfs_exit(); - -cleanup_device: orangefs_dev_cleanup(); +cleanup_sysfs: + orangefs_sysfs_exit(); + sysfs_init_failed: orangefs_debugfs_cleanup(); -- GitLab From d23417a5bf3a3afc55de5442eb46e1e60458b0a1 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:05 +0800 Subject: [PATCH 1202/1423] orangefs: Fix kmemleak in orangefs_prepare_debugfs_help_string() When insert and remove the orangefs module, then debug_help_string will be leaked: unreferenced object 0xffff8881652ba000 (size 4096): comm "insmod", pid 1701, jiffies 4294893639 (age 13218.530s) hex dump (first 32 bytes): 43 6c 69 65 6e 74 20 44 65 62 75 67 20 4b 65 79 Client Debug Key 77 6f 72 64 73 20 61 72 65 20 75 6e 6b 6e 6f 77 words are unknow backtrace: [<0000000004e6f8e3>] kmalloc_trace+0x27/0xa0 [<0000000006f75d85>] orangefs_prepare_debugfs_help_string+0x5e/0x480 [orangefs] [<0000000091270a2a>] _sub_I_65535_1+0x57/0xf70 [crc_itu_t] [<000000004b1ee1a3>] do_one_initcall+0x87/0x2a0 [<000000001d0614ae>] do_init_module+0xdf/0x320 [<00000000efef068c>] load_module+0x2f98/0x3330 [<000000006533b44d>] __do_sys_finit_module+0x113/0x1b0 [<00000000a0da6f99>] do_syscall_64+0x35/0x80 [<000000007790b19b>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 When remove the module, should always free debug_help_string. Should always free the allocated buffer when change the free_debug_help_string. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 29eaa45443727..a848b6ef95990 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -222,6 +222,8 @@ out: void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); + kfree(debug_help_string); + debug_help_string = NULL; } /* open ORANGEFS_KMOD_DEBUG_HELP_FILE */ @@ -671,6 +673,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot) memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE); strlcat(debug_help_string, new, string_size); mutex_unlock(&orangefs_help_file_lock); + kfree(new); } rc = 0; -- GitLab From 1f2c0e8a587bcafad85019a2d80f158d8d41a868 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:06 +0800 Subject: [PATCH 1203/1423] orangefs: Fix kmemleak in orangefs_sysfs_init() When insert and remove the orangefs module, there are kobjects memory leaked as below: unreferenced object 0xffff88810f95af00 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): a0 83 af 01 81 88 ff ff 08 af 95 0f 81 88 ff ff ................ 08 af 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005a6e4dfe>] orangefs_sysfs_init+0x42/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae80 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): c8 90 0f 02 81 88 ff ff 88 ae 95 0f 81 88 ff ff ................ 88 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001a4841fa>] orangefs_sysfs_init+0xc7/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 60 87 a1 00 81 88 ff ff 08 ae 95 0f 81 88 ff ff `............... 08 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005915e797>] orangefs_sysfs_init+0x12b/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ad80 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 78 90 0f 02 81 88 ff ff 88 ad 95 0f 81 88 ff ff x............... 88 ad 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000007a14eb35>] orangefs_sysfs_init+0x1ac/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ac00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.531s) hex dump (first 32 bytes): e0 ff 67 02 81 88 ff ff 08 ac 95 0f 81 88 ff ff ..g............. 08 ac 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001f38adcb>] orangefs_sysfs_init+0x291/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ab80 (size 64): comm "insmod", pid 783, jiffies 4294813441 (age 65.530s) hex dump (first 32 bytes): 50 bf 2f 02 81 88 ff ff 88 ab 95 0f 81 88 ff ff P./............. 88 ab 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000009cc7d95b>] orangefs_sysfs_init+0x2f5/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Should add release function for each kobject_type to free the memory. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-sysfs.c | 71 ++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index de80b62553bb1..be4ba03a01a0f 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(orangefs_default); +static struct kobject *orangefs_obj; + +static void orangefs_obj_release(struct kobject *kobj) +{ + kfree(orangefs_obj); + orangefs_obj = NULL; +} + static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = orangefs_default_groups, + .release = orangefs_obj_release, }; static struct orangefs_attribute acache_hard_limit_attribute = @@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(acache_orangefs_default); +static struct kobject *acache_orangefs_obj; + +static void acache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(acache_orangefs_obj); + acache_orangefs_obj = NULL; +} + static struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = acache_orangefs_default_groups, + .release = acache_orangefs_obj_release, }; static struct orangefs_attribute capcache_hard_limit_attribute = @@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(capcache_orangefs_default); +static struct kobject *capcache_orangefs_obj; + +static void capcache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(capcache_orangefs_obj); + capcache_orangefs_obj = NULL; +} + static struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = capcache_orangefs_default_groups, + .release = capcache_orangefs_obj_release, }; static struct orangefs_attribute ccache_hard_limit_attribute = @@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ccache_orangefs_default); +static struct kobject *ccache_orangefs_obj; + +static void ccache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ccache_orangefs_obj); + ccache_orangefs_obj = NULL; +} + static struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ccache_orangefs_default_groups, + .release = ccache_orangefs_obj_release, }; static struct orangefs_attribute ncache_hard_limit_attribute = @@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ncache_orangefs_default); +static struct kobject *ncache_orangefs_obj; + +static void ncache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ncache_orangefs_obj); + ncache_orangefs_obj = NULL; +} + static struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ncache_orangefs_default_groups, + .release = ncache_orangefs_obj_release, }; static struct orangefs_attribute pc_acache_attribute = @@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(pc_orangefs_default); +static struct kobject *pc_orangefs_obj; + +static void pc_orangefs_obj_release(struct kobject *kobj) +{ + kfree(pc_orangefs_obj); + pc_orangefs_obj = NULL; +} + static struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = pc_orangefs_default_groups, + .release = pc_orangefs_obj_release, }; static struct orangefs_attribute stats_reads_attribute = @@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(stats_orangefs_default); +static struct kobject *stats_orangefs_obj; + +static void stats_orangefs_obj_release(struct kobject *kobj) +{ + kfree(stats_orangefs_obj); + stats_orangefs_obj = NULL; +} + static struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = stats_orangefs_default_groups, + .release = stats_orangefs_obj_release, }; -static struct kobject *orangefs_obj; -static struct kobject *acache_orangefs_obj; -static struct kobject *capcache_orangefs_obj; -static struct kobject *ccache_orangefs_obj; -static struct kobject *ncache_orangefs_obj; -static struct kobject *pc_orangefs_obj; -static struct kobject *stats_orangefs_obj; - int orangefs_sysfs_init(void) { int rc = -EINVAL; -- GitLab From 31720a2b109b3080eb77e97b8f6f50a27b4ae599 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:07 +0800 Subject: [PATCH 1204/1423] orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init() When insert and remove the orangefs module, there are memory leaked as below: unreferenced object 0xffff88816b0cc000 (size 2048): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): 6e 6f 6e 65 0a 00 00 00 00 00 00 00 00 00 00 00 none............ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005b405fee>] orangefs_debugfs_init.cold+0xaf/0x17f [<00000000e5a0085b>] 0xffffffffa02780f9 [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Use the golbal variable as the buffer rather than dynamic allocate to slove the problem. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index a848b6ef95990..1b508f5433846 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask) */ static void orangefs_kernel_debug_init(void) { - int rc = -ENOMEM; - char *k_buffer = NULL; + static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(k_buffer, kernel_debug_string); strcat(k_buffer, "\n"); @@ -213,9 +208,6 @@ static void orangefs_kernel_debug_init(void) debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, &kernel_debug_fops); - -out: - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); } @@ -299,18 +291,13 @@ static int help_show(struct seq_file *m, void *v) /* * initialize the client-debug file. */ -static int orangefs_client_debug_init(void) +static void orangefs_client_debug_init(void) { - int rc = -ENOMEM; - char *c_buffer = NULL; + static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!c_buffer) - goto out; - if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(c_buffer, client_debug_string); strcat(c_buffer, "\n"); @@ -324,13 +311,6 @@ static int orangefs_client_debug_init(void) debug_dir, c_buffer, &kernel_debug_fops); - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -- GitLab From 2d47b79d2bd39cc6369eccf94a06568d84c906ae Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 29 Oct 2022 17:38:25 +0800 Subject: [PATCH 1205/1423] i2c: mux: reg: check return value after calling platform_get_resource() It will cause null-ptr-deref in resource_size(), if platform_get_resource() returns NULL, move calling resource_size() after devm_ioremap_resource() that will check 'res' to avoid null-ptr-deref. And use devm_platform_get_and_ioremap_resource() to simplify code. Fixes: b3fdd32799d8 ("i2c: mux: Add register-based mux i2c-mux-reg") Signed-off-by: Yang Yingliang Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-reg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 0e0679f65cf77..30a6de1694e07 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -183,13 +183,12 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) if (!mux->data.reg) { dev_info(&pdev->dev, "Register not set, using platform resource\n"); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - mux->data.reg_size = resource_size(res); - mux->data.reg = devm_ioremap_resource(&pdev->dev, res); + mux->data.reg = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(mux->data.reg)) { ret = PTR_ERR(mux->data.reg); goto err_put_parent; } + mux->data.reg_size = resource_size(res); } if (mux->data.reg_size != 4 && mux->data.reg_size != 2 && -- GitLab From 39244cc754829bf707dccd12e2ce37510f5b1f8d Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Fri, 29 Jul 2022 19:02:16 +0800 Subject: [PATCH 1206/1423] i2c: ismt: Fix an out-of-bounds bug in ismt_access() When the driver does not check the data from the user, the variable 'data->block[0]' may be very large to cause an out-of-bounds bug. The following log can reveal it: [ 33.995542] i2c i2c-1: ioctl, cmd=0x720, arg=0x7ffcb3dc3a20 [ 33.995978] ismt_smbus 0000:00:05.0: I2C_SMBUS_BLOCK_DATA: WRITE [ 33.996475] ================================================================== [ 33.996995] BUG: KASAN: out-of-bounds in ismt_access.cold+0x374/0x214b [ 33.997473] Read of size 18446744073709551615 at addr ffff88810efcfdb1 by task ismt_poc/485 [ 33.999450] Call Trace: [ 34.001849] memcpy+0x20/0x60 [ 34.002077] ismt_access.cold+0x374/0x214b [ 34.003382] __i2c_smbus_xfer+0x44f/0xfb0 [ 34.004007] i2c_smbus_xfer+0x10a/0x390 [ 34.004291] i2cdev_ioctl_smbus+0x2c8/0x710 [ 34.005196] i2cdev_ioctl+0x5ec/0x74c Fix this bug by checking the size of 'data->block[0]' first. Fixes: 13f35ac14cd0 ("i2c: Adding support for Intel iSMT SMBus 2.0 host controller") Signed-off-by: Zheyu Ma Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ismt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index fe2349590f75e..c74985d77b0ec 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -509,6 +509,9 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr, if (read_write == I2C_SMBUS_WRITE) { /* Block Write */ dev_dbg(dev, "I2C_SMBUS_BLOCK_DATA: WRITE\n"); + if (data->block[0] < 1 || data->block[0] > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + dma_size = data->block[0] + 1; dma_direction = DMA_TO_DEVICE; desc->wr_len_cmd = dma_size; -- GitLab From 76007ccc5727f86c105e96697e96dcf2df6b1634 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 7 Dec 2022 13:07:10 -0800 Subject: [PATCH 1207/1423] PCI: mvebu: Switch to using gpiod API Switch the driver away from legacy gpio/of_gpio API to gpiod API, and remove use of of_get_named_gpio_flags() which I want to make private to gpiolib. Link: https://lore.kernel.org/r/Y5EAft42YiT66mVj@google.com Signed-off-by: Dmitry Torokhov Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-mvebu.c | 51 ++++++++++-------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 1ced73726a267..b12e128fc5d76 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -11,14 +11,14 @@ #include #include #include -#include +#include #include +#include #include #include #include #include #include -#include #include #include @@ -1261,9 +1261,8 @@ static int mvebu_pcie_parse_port(struct mvebu_pcie *pcie, struct mvebu_pcie_port *port, struct device_node *child) { struct device *dev = &pcie->pdev->dev; - enum of_gpio_flags flags; u32 slot_power_limit; - int reset_gpio, ret; + int ret; u32 num_lanes; port->pcie = pcie; @@ -1327,40 +1326,24 @@ static int mvebu_pcie_parse_port(struct mvebu_pcie *pcie, port->name, child); } - reset_gpio = of_get_named_gpio_flags(child, "reset-gpios", 0, &flags); - if (reset_gpio == -EPROBE_DEFER) { - ret = reset_gpio; + port->reset_name = devm_kasprintf(dev, GFP_KERNEL, "%s-reset", + port->name); + if (!port->reset_name) { + ret = -ENOMEM; goto err; } - if (gpio_is_valid(reset_gpio)) { - unsigned long gpio_flags; - - port->reset_name = devm_kasprintf(dev, GFP_KERNEL, "%s-reset", - port->name); - if (!port->reset_name) { - ret = -ENOMEM; + port->reset_gpio = devm_fwnode_gpiod_get(dev, of_fwnode_handle(child), + "reset", GPIOD_OUT_HIGH, + port->name); + ret = PTR_ERR_OR_ZERO(port->reset_gpio); + if (ret) { + if (ret != -ENOENT) goto err; - } - - if (flags & OF_GPIO_ACTIVE_LOW) { - dev_info(dev, "%pOF: reset gpio is active low\n", - child); - gpio_flags = GPIOF_ACTIVE_LOW | - GPIOF_OUT_INIT_LOW; - } else { - gpio_flags = GPIOF_OUT_INIT_HIGH; - } - - ret = devm_gpio_request_one(dev, reset_gpio, gpio_flags, - port->reset_name); - if (ret) { - if (ret == -EPROBE_DEFER) - goto err; - goto skip; - } - - port->reset_gpio = gpio_to_desc(reset_gpio); + /* reset gpio is optional */ + port->reset_gpio = NULL; + devm_kfree(dev, port->reset_name); + port->reset_name = NULL; } slot_power_limit = of_pci_get_slot_power_limit(child, -- GitLab From fb4907f487254375830f135dcfe5dd7e6f8b705f Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Fri, 25 Nov 2022 09:00:26 +0800 Subject: [PATCH 1208/1423] RDMA/cma: Change RoCE packet life time from 18 to 16 The ack timeout retransmission time is affected by the following two factors: one is packet life time, another is the HCA processing time. Now the default packet lifetime(CMA_IBOE_PACKET_LIFETIME) is 18. That means the minimum ack timeout is 2 seconds (2^(18+1)*4us=2.097seconds). The packet lifetime means the maximum transmission time of packets on the network, 2 seconds is too long. Assume the network is a clos topology with three layers, every packet will pass through five hops of switches. Assume the buffer of every switch is 128MB and the port transmission rate is 25 Gbit/s, the maximum transmission time of the packet is 200ms(128MB*5/25Gbit/s). Add double redundancy, it is less than 500ms. So change the CMA_IBOE_PACKET_LIFETIME to 16, the maximum transmission time of the packet will be about 500+ms, it is long enough. Link: https://lore.kernel.org/r/20221125010026.755-1-lengchao@huawei.com Signed-off-by: Chao Leng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cc2222b85c881..2f5b5e6f3d110 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -47,7 +47,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { -- GitLab From 487d65090a3dce1ae54946aded55d0f8ac87cbab Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Sat, 26 Nov 2022 18:29:06 +0800 Subject: [PATCH 1209/1423] RDMA/hns: Fix the gid problem caused by free mr After the hns roce driver is loaded, if you modify the mac address of the network port, the following error will appear: __ib_cache_gid_add: unable to add gid fe80:0000:0000:0000:4600:4dff:fe22:abb5 error=-28 hns3 0000:7d:00.0 hns_0: attr path_mtu(1) invalid while modify qp The reason for the error is that the gid being occupied will cause the failure to modify the gid. The gid is occupied by the loopback QP used by free mr. When the mac address is modified, the gid will change. If there is a busy QP at this time, the gid will not be released and the modification will fail. The QP of free mr is created using the ib interface. The ib interface will add a reference count to the gid, resulting in this error scenario. Considering that free mr is solving a bug in HIP08, not an actual business, it is not necessary to use ib interfaces. Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Link: https://lore.kernel.org/r/20221126102911.2921820-2-xuhaoyue1@hisilicon.com Signed-off-by: Yixing Liu Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 175 ++++++++++++++++----- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 8 +- 2 files changed, 137 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 939811867249f..93c6772396865 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2634,31 +2634,124 @@ static void free_dip_list(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); } -static void free_mr_exit(struct hns_roce_dev *hr_dev) +static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_pd *hr_pd; + struct ib_pd *pd; + + hr_pd = kzalloc(sizeof(*hr_pd), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_pd)) + return NULL; + pd = &hr_pd->ibpd; + pd->device = ibdev; + + if (hns_roce_alloc_pd(pd, NULL)) { + ibdev_err(ibdev, "failed to create pd for free mr.\n"); + kfree(hr_pd); + return NULL; + } + free_mr->rsv_pd = to_hr_pd(pd); + free_mr->rsv_pd->ibpd.device = &hr_dev->ib_dev; + free_mr->rsv_pd->ibpd.uobject = NULL; + free_mr->rsv_pd->ibpd.__internal_mr = NULL; + atomic_set(&free_mr->rsv_pd->ibpd.usecnt, 0); + + return pd; +} + +static struct ib_cq *free_mr_init_cq(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_cq_init_attr cq_init_attr = {}; + struct hns_roce_cq *hr_cq; + struct ib_cq *cq; + + cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; + + hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_cq)) + return NULL; + + cq = &hr_cq->ib_cq; + cq->device = ibdev; + + if (hns_roce_create_cq(cq, &cq_init_attr, NULL)) { + ibdev_err(ibdev, "failed to create cq for free mr.\n"); + kfree(hr_cq); + return NULL; + } + free_mr->rsv_cq = to_hr_cq(cq); + free_mr->rsv_cq->ib_cq.device = &hr_dev->ib_dev; + free_mr->rsv_cq->ib_cq.uobject = NULL; + free_mr->rsv_cq->ib_cq.comp_handler = NULL; + free_mr->rsv_cq->ib_cq.event_handler = NULL; + free_mr->rsv_cq->ib_cq.cq_context = NULL; + atomic_set(&free_mr->rsv_cq->ib_cq.usecnt, 0); + + return cq; +} + +static int free_mr_init_qp(struct hns_roce_dev *hr_dev, struct ib_cq *cq, + struct ib_qp_init_attr *init_attr, int i) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + struct ib_qp *qp; int ret; + + hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_qp)) + return -ENOMEM; + + qp = &hr_qp->ibqp; + qp->device = ibdev; + + ret = hns_roce_create_qp(qp, init_attr, NULL); + if (ret) { + ibdev_err(ibdev, "failed to create qp for free mr.\n"); + kfree(hr_qp); + return ret; + } + + free_mr->rsv_qp[i] = hr_qp; + free_mr->rsv_qp[i]->ibqp.recv_cq = cq; + free_mr->rsv_qp[i]->ibqp.send_cq = cq; + + return 0; +} + +static void free_mr_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_qp *qp; int i; for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { if (free_mr->rsv_qp[i]) { - ret = ib_destroy_qp(free_mr->rsv_qp[i]); - if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to destroy qp in free mr.\n"); - + qp = &free_mr->rsv_qp[i]->ibqp; + hns_roce_v2_destroy_qp(qp, NULL); + kfree(free_mr->rsv_qp[i]); free_mr->rsv_qp[i] = NULL; } } if (free_mr->rsv_cq) { - ib_destroy_cq(free_mr->rsv_cq); + hns_roce_destroy_cq(&free_mr->rsv_cq->ib_cq, NULL); + kfree(free_mr->rsv_cq); free_mr->rsv_cq = NULL; } if (free_mr->rsv_pd) { - ib_dealloc_pd(free_mr->rsv_pd); + hns_roce_dealloc_pd(&free_mr->rsv_pd->ibpd, NULL); + kfree(free_mr->rsv_pd); free_mr->rsv_pd = NULL; } } @@ -2667,55 +2760,46 @@ static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; - struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_cq_init_attr cq_init_attr = {}; struct ib_qp_init_attr qp_init_attr = {}; struct ib_pd *pd; struct ib_cq *cq; - struct ib_qp *qp; int ret; int i; - pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ibdev_err(ibdev, "failed to create pd for free mr.\n"); - return PTR_ERR(pd); - } - free_mr->rsv_pd = pd; + pd = free_mr_init_pd(hr_dev); + if (!pd) + return -ENOMEM; - cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_init_attr); - if (IS_ERR(cq)) { - ibdev_err(ibdev, "failed to create cq for free mr.\n"); - ret = PTR_ERR(cq); - goto create_failed; + cq = free_mr_init_cq(hr_dev); + if (!cq) { + ret = -ENOMEM; + goto create_failed_cq; } - free_mr->rsv_cq = cq; qp_init_attr.qp_type = IB_QPT_RC; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; - qp_init_attr.send_cq = free_mr->rsv_cq; - qp_init_attr.recv_cq = free_mr->rsv_cq; + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { qp_init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; qp_init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; qp_init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; qp_init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; - qp = ib_create_qp(free_mr->rsv_pd, &qp_init_attr); - if (IS_ERR(qp)) { - ibdev_err(ibdev, "failed to create qp for free mr.\n"); - ret = PTR_ERR(qp); - goto create_failed; - } - - free_mr->rsv_qp[i] = qp; + ret = free_mr_init_qp(hr_dev, cq, &qp_init_attr, i); + if (ret) + goto create_failed_qp; } return 0; -create_failed: - free_mr_exit(hr_dev); +create_failed_qp: + hns_roce_destroy_cq(cq, NULL); + kfree(cq); + +create_failed_cq: + hns_roce_dealloc_pd(pd, NULL); + kfree(pd); return ret; } @@ -2731,14 +2815,17 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, int mask; int ret; - hr_qp = to_hr_qp(free_mr->rsv_qp[sl_num]); + hr_qp = to_hr_qp(&free_mr->rsv_qp[sl_num]->ibqp); hr_qp->free_mr_en = 1; + hr_qp->ibqp.device = ibdev; + hr_qp->ibqp.qp_type = IB_QPT_RC; mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_INIT); if (ret) { ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2759,7 +2846,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num); - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_RTR); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2773,7 +2861,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->sq_psn = HNS_ROCE_FREE_MR_USED_PSN; attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, + IB_QPS_RTS); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -3416,7 +3505,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) mutex_lock(&free_mr->mutex); for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { - hr_qp = to_hr_qp(free_mr->rsv_qp[i]); + hr_qp = free_mr->rsv_qp[i]; ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { @@ -3431,7 +3520,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) end = msecs_to_jiffies(HNS_ROCE_V2_FREE_MR_TIMEOUT) + jiffies; while (cqe_cnt) { - npolled = hns_roce_v2_poll_cq(free_mr->rsv_cq, cqe_cnt, wc); + npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { ibdev_err(ibdev, "failed to poll cqe for free mr, remain %d cqe.\n", @@ -5474,7 +5563,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; } -static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index b11579027e827..017462e52843c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1329,9 +1329,9 @@ struct hns_roce_link_table { #define HNS_ROCE_EXT_LLM_MIN_PAGES(que_num) ((que_num) * 4 + 2) struct hns_roce_v2_free_mr { - struct ib_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; - struct ib_cq *rsv_cq; - struct ib_pd *rsv_pd; + struct hns_roce_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; + struct hns_roce_cq *rsv_cq; + struct hns_roce_pd *rsv_pd; struct mutex mutex; }; @@ -1461,6 +1461,8 @@ struct hns_roce_sccc_clr_done { __le32 rsv[5]; }; +int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) { -- GitLab From bc34c04f7b97c3794dec5a6d6d27ffd5f0e4f5c8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:07 +0800 Subject: [PATCH 1210/1423] RDMA/hns: Fix AH attr queried by query_qp The queried AH attr is invalid. This patch fix it. This problem is found by rdma-core test test_mr_rereg_pd ERROR: test_mr_rereg_pd (tests.test_mr.MRTest) Test that cover rereg MR's PD with this flow: ---------------------------------------------------------------------- Traceback (most recent call last): File "./tests/test_mr.py", line 157, in test_mr_rereg_pd self.restate_qps() File "./tests/test_mr.py", line 113, in restate_qps self.server.qp.to_rts(self.server_qp_attr) File "qp.pyx", line 1137, in pyverbs.qp.QP.to_rts File "qp.pyx", line 1123, in pyverbs.qp.QP.to_rtr pyverbs.pyverbs_error.PyverbsRDMAError: Failed to modify QP state to RTR. Errno: 22, Invalid argument Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Link: https://lore.kernel.org/r/20221126102911.2921820-3-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 93c6772396865..a8f8c790d31dc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5470,6 +5470,8 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, rdma_ah_set_sl(&qp_attr->ah_attr, hr_reg_read(&context, QPC_SL)); + rdma_ah_set_port_num(&qp_attr->ah_attr, hr_qp->port + 1); + rdma_ah_set_ah_flags(&qp_attr->ah_attr, IB_AH_GRH); grh->flow_label = hr_reg_read(&context, QPC_FL); grh->sgid_index = hr_reg_read(&context, QPC_GMV_IDX); grh->hop_limit = hr_reg_read(&context, QPC_HOPLIMIT); -- GitLab From 9fb39ef2ff3e18f1740625ba04093dfbef086d2b Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:08 +0800 Subject: [PATCH 1211/1423] RDMA/hns: Fix PBL page MTR find Now, The address of the first two pages in the MR will be searched, which use to speed up the lookup of the pbl table for hardware. An exception will occur when there is only one page in this MR. This patch fix the number of page to search. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Link: https://lore.kernel.org/r/20221126102911.2921820-4-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a8f8c790d31dc..41835cb05983a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3274,7 +3274,8 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, int i, count; count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, - ARRAY_SIZE(pages), &pbl_ba); + min_t(int, ARRAY_SIZE(pages), mr->npages), + &pbl_ba); if (count < 1) { ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n", count); -- GitLab From 99dc5a0712883d5d13b620d25b3759d429577bc8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:09 +0800 Subject: [PATCH 1212/1423] RDMA/hns: Fix page size cap from firmware Add verification to make sure the roce page size cap is supported by the system page size. Fixes: ba6bb7e97421 ("RDMA/hns: Add interfaces to get pf capabilities from firmware") Link: https://lore.kernel.org/r/20221126102911.2921820-5-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 41835cb05983a..6e74735bbcf87 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2345,6 +2345,9 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) caps->wqe_sge_hop_num = hr_reg_read(resp_d, PF_CAPS_D_EX_SGE_HOP_NUM); caps->wqe_rq_hop_num = hr_reg_read(resp_d, PF_CAPS_D_RQWQE_HOP_NUM); + if (!(caps->page_size_cap & PAGE_SIZE)) + caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED; + return 0; } -- GitLab From 667d6164b84884c64de3fc18670cd5a98b0b10cf Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:10 +0800 Subject: [PATCH 1213/1423] RDMA/hns: Fix error code of CMD The error code is fixed to EIO when CMD fails to excute. This patch converts the error status reported by firmware to linux errno. Fixes: a04ff739f2a9 ("RDMA/hns: Add command queue support for hip08 RoCE driver") Link: https://lore.kernel.org/r/20221126102911.2921820-6-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 +++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 +++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6e74735bbcf87..f32100c6f1d9e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1277,6 +1277,30 @@ static void update_cmdq_status(struct hns_roce_dev *hr_dev) hr_dev->cmd.state = HNS_ROCE_CMDQ_STATE_FATAL_ERR; } +static int hns_roce_cmd_err_convert_errno(u16 desc_ret) +{ + struct hns_roce_cmd_errcode errcode_table[] = { + {CMD_EXEC_SUCCESS, 0}, + {CMD_NO_AUTH, -EPERM}, + {CMD_NOT_EXIST, -EOPNOTSUPP}, + {CMD_CRQ_FULL, -EXFULL}, + {CMD_NEXT_ERR, -ENOSR}, + {CMD_NOT_EXEC, -ENOTBLK}, + {CMD_PARA_ERR, -EINVAL}, + {CMD_RESULT_ERR, -ERANGE}, + {CMD_TIMEOUT, -ETIME}, + {CMD_HILINK_ERR, -ENOLINK}, + {CMD_INFO_ILLEGAL, -ENXIO}, + {CMD_INVALID, -EBADR}, + }; + u16 i; + + for (i = 0; i < ARRAY_SIZE(errcode_table); i++) + if (desc_ret == errcode_table[i].return_status) + return errcode_table[i].errno; + return -EIO; +} + static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { @@ -1322,7 +1346,7 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, dev_err_ratelimited(hr_dev->dev, "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", desc->opcode, desc_ret); - ret = -EIO; + ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { /* FW/HW reset or incorrect number of desc */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 017462e52843c..47fad456839d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -273,6 +273,11 @@ enum hns_roce_cmd_return_status { CMD_OTHER_ERR = 0xff }; +struct hns_roce_cmd_errcode { + enum hns_roce_cmd_return_status return_status; + int errno; +}; + enum hns_roce_sgid_type { GID_TYPE_FLAG_ROCE_V1 = 0, GID_TYPE_FLAG_ROCE_V2_IPV4, -- GitLab From 682c0722addae4b4a1440c9db9d8c86cb8e09ce5 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:11 +0800 Subject: [PATCH 1214/1423] RDMA/hns: Fix XRC caps on HIP08 XRC caps has been set by default. But in fact, XRC is not supported in HIP08. Fixes: 32548870d438 ("RDMA/hns: Add support for XRC on HIP09") Link: https://lore.kernel.org/r/20221126102911.2921820-7-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f32100c6f1d9e..2716852f5e929 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2051,13 +2051,14 @@ static void set_default_caps(struct hns_roce_dev *hr_dev) caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW | HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR | - HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL | HNS_ROCE_CAP_FLAG_XRC; + HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { caps->flags |= HNS_ROCE_CAP_FLAG_STASH | - HNS_ROCE_CAP_FLAG_DIRECT_WQE; + HNS_ROCE_CAP_FLAG_DIRECT_WQE | + HNS_ROCE_CAP_FLAG_XRC; caps->max_sq_inline = HNS_ROCE_V3_MAX_SQ_INLINE; } else { caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; -- GitLab From 6cfe7bd0dfd33033683639039b5608d6534c19eb Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 8 Dec 2022 05:19:54 -0500 Subject: [PATCH 1215/1423] RDMA/mlx5: Remove not-used IB_FLOW_SPEC_IB define IB_FLOW_SPEC_IB is not used in mlx5 and can be deleted. Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20221208101954.687960-1-yanjun.zhu@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 490ec308e3098..3008632a6c206 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -127,7 +127,6 @@ static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) } #define LAST_ETH_FIELD vlan_tag -#define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port -- GitLab From 0bf588274f73b29d7058042a6b7cc2b764502cc1 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Wed, 23 Nov 2022 17:51:21 +0100 Subject: [PATCH 1216/1423] Fix path in cifs/usage.rst /sys/module/... not /proc/module/... Signed-off-by: Volker Lendecke Signed-off-by: Steve French --- Documentation/admin-guide/cifs/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index 3766bf8a1c20e..ed3b8dc854ecf 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -858,7 +858,7 @@ CIFS kernel module parameters These module parameters can be specified or modified either during the time of module loading or during the runtime by using the interface:: - /proc/module/cifs/parameters/ + /sys/module/cifs/parameters/ i.e.:: -- GitLab From 83fb8abec29383eb0cf35495d21669e38548771b Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:26:00 +0100 Subject: [PATCH 1217/1423] cifs: Add "extbuf" and "extbuflen" args to smb2_compound_op() Will carry the variable-sized reply from SMB_FIND_FILE_POSIX_INFO Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 68e08c85fbb87..1be86ba950b35 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -59,6 +59,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + __u8 **extbuf, size_t *extbuflen, struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; @@ -539,7 +540,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && @@ -555,7 +556,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -589,7 +590,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -606,7 +607,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -624,7 +625,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); } void @@ -646,7 +647,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -658,7 +659,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); } int @@ -667,7 +668,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); } static int @@ -686,7 +687,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile, NULL, NULL); + command, cfile, NULL, NULL, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -727,7 +728,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); } int @@ -754,7 +755,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, - NULL, NULL); + NULL, NULL, NULL, NULL); cifs_put_tlink(tlink); return rc; } -- GitLab From 64ce47cb1b29d7d6aab6dcc287ae1fddb4876bd5 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:37:44 +0100 Subject: [PATCH 1218/1423] cifs: Parse owner/group for stat in smb311 posix extensions stat was returning default owner and group (unlike readdir) for SMB3.1.1 POSIX extensions Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/inode.c | 13 ++++++++---- fs/cifs/smb2inode.c | 49 ++++++++++++++++++++++++++++++++++++++++++--- fs/cifs/smb2proto.h | 5 ++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4e2ca3c6e5c00..286a5400b94e5 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -632,6 +632,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, struct super_block *sb, bool adjust_tz, bool symlink) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -680,8 +682,8 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope } /* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */ - fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */ - fattr->cf_gid = cifs_sb->ctx->linux_gid; + sid_to_id(cifs_sb, owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1175,6 +1177,7 @@ smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {0}; bool symlink = false; struct cifs_open_info_data data = {}; + struct cifs_sid owner, group; int rc = 0; int tmprc = 0; @@ -1192,7 +1195,8 @@ smb311_posix_get_inode_info(struct inode **inode, goto out; } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, + &owner, &group, &adjust_tz, &symlink); /* @@ -1201,7 +1205,8 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, + sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1be86ba950b35..fbd46db1023ae 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -431,6 +431,21 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } + if (rc == 0) { + unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); + + if (length > sizeof(idata->posix_fi)) { + char *base = (char *)rsp_iov[1].iov_base + + le16_to_cpu(qi_rsp->OutputBufferOffset) + + sizeof(idata->posix_fi); + *extbuflen = length - sizeof(idata->posix_fi); + *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); + if (!*extbuf) + rc = -ENOMEM; + } else { + rc = -EINVAL; + } + } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) @@ -569,13 +584,20 @@ out: int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; struct kvec err_iov[3] = {}; int err_buftype[3] = {}; + __u8 *sidsbuf = NULL; + __u8 *sidsbuf_end = NULL; + size_t sidsbuflen = 0; + size_t owner_len, group_len; *adjust_tz = false; *reparse = false; @@ -590,7 +612,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - NULL, NULL, err_iov, err_buftype); + &sidsbuf, &sidsbuflen, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -607,10 +629,31 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, + &sidsbuf, &sidsbuflen, NULL, NULL); + } + + if (rc == 0) { + sidsbuf_end = sidsbuf + sidsbuflen; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(owner, sidsbuf, owner_len); + + group_len = posix_info_sid_size( + sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(group, sidsbuf + owner_len, group_len); } out: + kfree(sidsbuf); free_rsp_buf(err_buftype[0], err_iov[0].iov_base); free_rsp_buf(err_buftype[1], err_iov[1].iov_base); free_rsp_buf(err_buftype[2], err_iov[2].iov_base); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index be21b5d26f67e..d5d7ffb7711c4 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -277,7 +277,10 @@ extern int smb2_query_info_compound(const unsigned int xid, /* query path info from the server using SMB311 POSIX extensions*/ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -- GitLab From 9381666e289852f93be9d7f4f7844017e04f6315 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:33 +0100 Subject: [PATCH 1219/1423] cifs: wire up >migrate_folio CIFS does not use page private data that needs migration, so it can just wire up filemap_migrate_folio. This prepares for removing ->writepage, which is used as a fallback if no migrate_folio method is set. Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd96982099309..6be924caed393 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -5240,10 +5240,10 @@ const struct address_space_operations cifs_addr_ops = { .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, /* - * TODO: investigate and if useful we could add an cifs_migratePage - * helper (under an CONFIG_MIGRATION) in the future, and also - * investigate and add an is_dirty_writeback helper if needed + * TODO: investigate and if useful we could add an is_dirty_writeback + * helper if needed */ .swap_activate = cifs_swap_activate, .swap_deactivate = cifs_swap_deactivate, @@ -5264,4 +5264,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, }; -- GitLab From bff9018d3a52c45711bd63c446a2c80c0275e935 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:34 +0100 Subject: [PATCH 1220/1423] cifs: stop using generic_writepages generic_writepages is just a wrapper that calls ->writepages on a range, and thus in the way of eventually removing ->writepage. Switch cifs to just open code it in preparation of removing ->writepage. [note: I suspect just integrating the small wsize case with the rest of the writeback code might be a better idea here, but that needs someone more familiar with the code] Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6be924caed393..ec14e38411a13 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2646,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, return rc; } +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + +static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret; + + ret = cifs_writepage_locked(page, wbc); + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2662,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping, /* * If wsize is smaller than the page cache size, default to writing - * one page at a time via cifs_writepage + * one page at a time. */ if (cifs_sb->ctx->wsize < PAGE_SIZE) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, cifs_write_one_page, + mapping); xid = get_xid(); if (wbc->range_cyclic) { -- GitLab From ebaad77c89921c8237ca17791d5462bd289052d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:35 +0100 Subject: [PATCH 1221/1423] cifs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Now that cifs implements ->migrate_folio and doesn't call generic_writepages, the writepage method can be removed. Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ec14e38411a13..6701257541ab2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2868,13 +2868,6 @@ retry_write: return rc; } -static int cifs_writepage(struct page *page, struct writeback_control *wbc) -{ - int rc = cifs_writepage_locked(page, wbc); - unlock_page(page); - return rc; -} - static int cifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) @@ -5247,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, @@ -5272,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = { */ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, -- GitLab From d406d26745aba3365ab9171b2d5cbea9c1757305 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 5 Dec 2022 23:31:53 -0300 Subject: [PATCH 1222/1423] cifs: skip alloc when request has no pages When smb3_init_transform_rq() was being called with requests (@old_rq) which had no pages, it was unnecessarily allocating a single page for every request in @new_rq. Fix this by skipping page array allocation when requests have no pages (e.g. !smb_rqst::rq_npages). Also get rid of deprecated kmap() and use kmap_local_page() instead while we're at it. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f2..72b22d033ed56 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4445,21 +4445,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, int rc = -ENOMEM; for (i = 1; i < num_rqst; i++) { - npages = old_rq[i - 1].rq_npages; + struct smb_rqst *old = &old_rq[i - 1]; + struct smb_rqst *new = &new_rq[i]; + + orig_len += smb_rqst_len(server, old); + new->rq_iov = old->rq_iov; + new->rq_nvec = old->rq_nvec; + + npages = old->rq_npages; + if (!npages) + continue; + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto err_free; - new_rq[i].rq_pages = pages; - new_rq[i].rq_npages = npages; - new_rq[i].rq_offset = old_rq[i - 1].rq_offset; - new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz; - new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz; - new_rq[i].rq_iov = old_rq[i - 1].rq_iov; - new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec; - - orig_len += smb_rqst_len(server, &old_rq[i - 1]); + new->rq_pages = pages; + new->rq_npages = npages; + new->rq_offset = old->rq_offset; + new->rq_pagesz = old->rq_pagesz; + new->rq_tailsz = old->rq_tailsz; for (j = 0; j < npages; j++) { pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); @@ -4472,14 +4478,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, char *dst, *src; unsigned int offset, len; - rqst_page_get_length(&new_rq[i], j, &len, &offset); + rqst_page_get_length(new, j, &len, &offset); - dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset; - src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset; + dst = kmap_local_page(new->rq_pages[j]) + offset; + src = kmap_local_page(old->rq_pages[j]) + offset; memcpy(dst, src, len); - kunmap(new_rq[i].rq_pages[j]); - kunmap(old_rq[i - 1].rq_pages[j]); + kunmap(new->rq_pages[j]); + kunmap(old->rq_pages[j]); } } -- GitLab From 52f31ed228212ba572c44e15e818a3a5c74122c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 8 Dec 2022 08:29:22 -0800 Subject: [PATCH 1223/1423] xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING Resulting in a UAF if the shrinker races with some other dquot freeing mechanism that sets XFS_DQFLAG_FREEING before the dquot is removed from the LRU. This can occur if a dquot purge races with drop_caches. Reported-by: syzbot+912776840162c13db1a3@syzkaller.appspotmail.com Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18bb4ec4d7c9b..ff53d40a2dae3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -422,6 +422,14 @@ xfs_qm_dquot_isolate( if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; + /* + * If something else is freeing this dquot and hasn't yet removed it + * from the LRU, leave it for the freeing task to complete the freeing + * process rather than risk it being free from under us here. + */ + if (dqp->q_flags & XFS_DQFLAG_FREEING) + goto out_miss_unlock; + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. @@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - goto out_miss_busy; - } + if (!xfs_dqflock_nowait(dqp)) + goto out_miss_unlock; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; +out_miss_unlock: + xfs_dqunlock(dqp); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); -- GitLab From 7dfb216eda99bbfc2a8c3b03d2eec63314f52b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Thu, 8 Dec 2022 23:40:16 +0100 Subject: [PATCH 1224/1423] ACPICA: Fix operand resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our tests we get UBSAN warning coming from ACPI parser. This is caused by trying to resolve operands when there is none. [ 0.000000] Linux version 5.15.0-rc3chromeavsrel1.0.184+ (root@...) (gcc (Ubuntu 10.3.0-1ubuntu1~20.04) 10.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP PREEMPT Sat Oct 16 00:08:27 UTC 2021 ... [ 14.719508] ================================================================================ [ 14.719551] UBSAN: array-index-out-of-bounds in /.../linux/drivers/acpi/acpica/dswexec.c:401:12 [ 14.719594] index -1 is out of range for type 'acpi_operand_object *[9]' [ 14.719621] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc3chromeavsrel1.0.184+ #1 [ 14.719657] Hardware name: Intel Corp. Geminilake/GLK RVP2 LP4SD (07), BIOS GELKRVPA.X64.0214.B50.2009111159 09/11/2020 [ 14.719694] Call Trace: [ 14.719712] dump_stack_lvl+0x38/0x49 [ 14.719749] dump_stack+0x10/0x12 [ 14.719775] ubsan_epilogue+0x9/0x45 [ 14.719801] __ubsan_handle_out_of_bounds.cold+0x44/0x49 [ 14.719835] acpi_ds_exec_end_op+0x1d7/0x6b5 [ 14.719870] acpi_ps_parse_loop+0x942/0xb34 ... Problem happens because WalkState->NumOperands is 0 and it is used when trying to access into operands table. Actual code is: WalkState->Operands [WalkState->NumOperands -1] which causes out of bound access. Improve the check before above access to check if ACPI opcode should have any arguments (operands) at all. Link: https://github.com/acpica/acpica/pull/745 Signed-off-by: Amadeusz Sławiński Reviewed-by: Cezary Rojewski Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/dswexec.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c index e8ad41387f84c..b082eb942a0f8 100644 --- a/drivers/acpi/acpica/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -389,9 +389,11 @@ acpi_status acpi_ds_exec_end_op(struct acpi_walk_state *walk_state) /* * All opcodes require operand resolution, with the only exceptions - * being the object_type and size_of operators. + * being the object_type and size_of operators as well as opcodes that + * take no arguments. */ - if (!(walk_state->op_info->flags & AML_NO_OPERAND_RESOLVE)) { + if (!(walk_state->op_info->flags & AML_NO_OPERAND_RESOLVE) && + (walk_state->op_info->flags & AML_HAS_ARGS)) { /* Resolve all operands */ -- GitLab From 7a9d74e7e403cb2e60d4d00c05f2f3ab2a33d0c3 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 8 Dec 2022 15:23:32 +0100 Subject: [PATCH 1225/1423] ACPICA: include/acpi/acpixf.h: Fix indentation A bunch of the functions declared in include/acpi/acpixf.h have their name aligned a space after the '(' of e.g. the `ACPI_EXTERNAL_RETURN_STATUS(acpi_status` line above rather then being directly aligned after the '('. This breaks applying patches generated from the ACPICA upstream git, remove the extra space before the function-names and all the arguments to fix this. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 120 +++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9e49b37fc869f..d1329d6d526d9 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -589,82 +589,82 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_install_initialization_handler (acpi_init_handler handler, u32 function)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_install_sci_handler(acpi_sci_handler - address, - void *context)) -ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_remove_sci_handler(acpi_sci_handler - address)) + acpi_install_sci_handler(acpi_sci_handler + address, + void *context)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_install_global_event_handler - (acpi_gbl_event_handler handler, - void *context)) + acpi_remove_sci_handler(acpi_sci_handler + address)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_install_fixed_event_handler(u32 - acpi_event, - acpi_event_handler - handler, - void - *context)) + acpi_install_global_event_handler + (acpi_gbl_event_handler handler, + void *context)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_remove_fixed_event_handler(u32 acpi_event, + acpi_install_fixed_event_handler(u32 + acpi_event, acpi_event_handler - handler)) -ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_install_gpe_handler(acpi_handle - gpe_device, - u32 gpe_number, - u32 type, - acpi_gpe_handler - address, - void *context)) + handler, + void + *context)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_install_gpe_raw_handler(acpi_handle - gpe_device, - u32 gpe_number, - u32 type, - acpi_gpe_handler - address, - void *context)) + acpi_remove_fixed_event_handler(u32 acpi_event, + acpi_event_handler + handler)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status - acpi_remove_gpe_handler(acpi_handle gpe_device, + acpi_install_gpe_handler(acpi_handle + gpe_device, u32 gpe_number, + u32 type, acpi_gpe_handler - address)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_install_notify_handler(acpi_handle device, - u32 handler_type, - acpi_notify_handler - handler, + address, void *context)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status + acpi_install_gpe_raw_handler(acpi_handle + gpe_device, + u32 gpe_number, + u32 type, + acpi_gpe_handler + address, + void *context)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status + acpi_remove_gpe_handler(acpi_handle gpe_device, + u32 gpe_number, + acpi_gpe_handler + address)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_remove_notify_handler(acpi_handle device, + acpi_install_notify_handler(acpi_handle device, u32 handler_type, acpi_notify_handler - handler)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_install_address_space_handler(acpi_handle - device, - acpi_adr_space_type - space_id, - acpi_adr_space_handler - handler, - acpi_adr_space_setup - setup, - void *context)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_remove_address_space_handler(acpi_handle + handler, + void *context)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_remove_notify_handler(acpi_handle device, + u32 handler_type, + acpi_notify_handler + handler)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_install_address_space_handler(acpi_handle device, acpi_adr_space_type space_id, acpi_adr_space_handler - handler)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_install_exception_handler - (acpi_exception_handler handler)) -ACPI_EXTERNAL_RETURN_STATUS(acpi_status - acpi_install_interface_handler - (acpi_interface_handler handler)) + handler, + acpi_adr_space_setup + setup, + void *context)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_remove_address_space_handler(acpi_handle + device, + acpi_adr_space_type + space_id, + acpi_adr_space_handler + handler)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_install_exception_handler + (acpi_exception_handler handler)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_install_interface_handler + (acpi_interface_handler handler)) /* * Global Lock interfaces -- GitLab From 54c516aeb8b39eeae6450b7d8076d381568dca46 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 8 Dec 2022 15:23:33 +0100 Subject: [PATCH 1226/1423] ACPICA: Allow address_space_handler Install and _REG execution as 2 separate steps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACPI-2.0 says that the EC op_region handler must be available immediately (like the standard default op_region handlers): Quoting from the ACPI spec version 6.3: "6.5.4 _REG (Region) ... 2. OSPM must make Embedded Controller operation regions, accessed via the Embedded Controllers described in ECDT, available before executing any control method. These operation regions may become inaccessible after OSPM runs _REG(EmbeddedControl, 0)." So the OS must probe the ECDT described EC and install the OpRegion handler before calling acpi_enable_subsystem() and acpi_initialize_objects(). This is a problem because calling acpi_install_address_space_handler() does not just install the op_region handler, it also runs the EC's _REG method. This _REG method may rely on initialization done by the _INI methods of one of the PCI / _SB root devices. For the other early/default op_region handlers the op_region handler install and the _REG execution is split into 2 separate steps: 1. acpi_ev_install_region_handlers(), called early from acpi_load_tables() 2. acpi_ev_initialize_op_regions(), called from acpi_initialize_objects() To fix the EC op_region issue, add 2 bew functions: 1. acpi_install_address_space_handler_no_reg() 2. acpi_execute_reg_methods() to allow doing things in 2 steps for other op_region handlers, like the EC handler, too. Note that the comment describing acpi_ev_install_region_handlers() even has an alinea describing this problem. Using the new methods allows users to avoid this problem. Link: https://github.com/acpica/acpica/pull/786 Link: https://bugzilla.kernel.org/show_bug.cgi?id=214899 Reported-and-tested-by: Johannes Penßel Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfregn.c | 92 +++++++++++++++++++++++++++++++--- include/acpi/acpixf.h | 10 ++++ 2 files changed, 95 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index 0a8372bf6a77d..a5c19f46ec17d 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -20,13 +20,14 @@ ACPI_MODULE_NAME("evxfregn") /******************************************************************************* * - * FUNCTION: acpi_install_address_space_handler + * FUNCTION: acpi_install_address_space_handler_internal * * PARAMETERS: device - Handle for the device * space_id - The address space ID * handler - Address of the handler * setup - Address of the setup function * context - Value passed to the handler on each access + * Run_reg - Run _REG methods for this address space? * * RETURN: Status * @@ -37,13 +38,16 @@ ACPI_MODULE_NAME("evxfregn") * are executed here, and these methods can only be safely executed after * the default handlers have been installed and the hardware has been * initialized (via acpi_enable_subsystem.) + * To avoid this problem pass FALSE for Run_Reg and later on call + * acpi_execute_reg_methods() to execute _REG. * ******************************************************************************/ -acpi_status -acpi_install_address_space_handler(acpi_handle device, - acpi_adr_space_type space_id, - acpi_adr_space_handler handler, - acpi_adr_space_setup setup, void *context) +static acpi_status +acpi_install_address_space_handler_internal(acpi_handle device, + acpi_adr_space_type space_id, + acpi_adr_space_handler handler, + acpi_adr_space_setup setup, + void *context, u8 run_reg) { struct acpi_namespace_node *node; acpi_status status; @@ -80,14 +84,40 @@ acpi_install_address_space_handler(acpi_handle device, /* Run all _REG methods for this address space */ - acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + if (run_reg) { + acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + } unlock_and_exit: (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); return_ACPI_STATUS(status); } +acpi_status +acpi_install_address_space_handler(acpi_handle device, + acpi_adr_space_type space_id, + acpi_adr_space_handler handler, + acpi_adr_space_setup setup, void *context) +{ + return acpi_install_address_space_handler_internal(device, space_id, + handler, setup, + context, TRUE); +} + ACPI_EXPORT_SYMBOL(acpi_install_address_space_handler) +acpi_status +acpi_install_address_space_handler_no_reg(acpi_handle device, + acpi_adr_space_type space_id, + acpi_adr_space_handler handler, + acpi_adr_space_setup setup, + void *context) +{ + return acpi_install_address_space_handler_internal(device, space_id, + handler, setup, + context, FALSE); +} + +ACPI_EXPORT_SYMBOL(acpi_install_address_space_handler_no_reg) /******************************************************************************* * @@ -228,3 +258,51 @@ unlock_and_exit: } ACPI_EXPORT_SYMBOL(acpi_remove_address_space_handler) +/******************************************************************************* + * + * FUNCTION: acpi_execute_reg_methods + * + * PARAMETERS: device - Handle for the device + * space_id - The address space ID + * + * RETURN: Status + * + * DESCRIPTION: Execute _REG for all op_regions of a given space_id. + * + ******************************************************************************/ +acpi_status +acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) +{ + struct acpi_namespace_node *node; + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_execute_reg_methods); + + /* Parameter validation */ + + if (!device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Convert and validate the device handle */ + + node = acpi_ns_validate_handle(device); + if (node) { + + /* Run all _REG methods for this address space */ + + acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); + } else { + status = AE_BAD_PARAMETER; + } + + (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); + return_ACPI_STATUS(status); +} + +ACPI_EXPORT_SYMBOL(acpi_execute_reg_methods) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index d1329d6d526d9..9778408f8db40 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -652,6 +652,16 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_adr_space_setup setup, void *context)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_install_address_space_handler_no_reg + (acpi_handle device, acpi_adr_space_type space_id, + acpi_adr_space_handler handler, + acpi_adr_space_setup setup, + void *context)) +ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_execute_reg_methods(acpi_handle device, + acpi_adr_space_type + space_id)) ACPI_EXTERNAL_RETURN_STATUS(acpi_status acpi_remove_address_space_handler(acpi_handle device, -- GitLab From a5072078dbfaa9d70130805766dfa34bbb7bf2a7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 8 Dec 2022 15:23:34 +0100 Subject: [PATCH 1227/1423] ACPI: EC: Fix EC address space handler unregistration When an ECDT table is present the EC address space handler gets registered on the root node. So to unregister it properly the unregister call also must be done on the root node. Store the ACPI handle used for the acpi_install_address_space_handler() call and use te same handle for the acpi_remove_address_space_handler() call. Reported-by: Rafael J. Wysocki Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 4 +++- drivers/acpi/internal.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 9751b84c1b221..5a21e4d58322b 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1475,6 +1475,7 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device) return -ENODEV; } set_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags); + ec->address_space_handler_holder = ec->handle; } if (!device) @@ -1526,7 +1527,8 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device) static void ec_remove_handlers(struct acpi_ec *ec) { if (test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) { - if (ACPI_FAILURE(acpi_remove_address_space_handler(ec->handle, + if (ACPI_FAILURE(acpi_remove_address_space_handler( + ec->address_space_handler_holder, ACPI_ADR_SPACE_EC, &acpi_ec_space_handler))) pr_err("failed to remove space handler\n"); clear_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 219c02df9a08c..ec584442fb298 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -173,6 +173,7 @@ enum acpi_ec_event_state { struct acpi_ec { acpi_handle handle; + acpi_handle address_space_handler_holder; int gpe; int irq; unsigned long command_addr; -- GitLab From ab4620f58d38206687b9f99d9d2cc1d5a2640985 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 8 Dec 2022 15:23:35 +0100 Subject: [PATCH 1228/1423] ACPI: EC: Fix ECDT probe ordering issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACPI-2.0 says that the EC OpRegion handler must be available immediately (like the standard default OpRegion handlers): Quoting from the ACPI spec version 6.3: "6.5.4 _REG (Region) ... 2. OSPM must make Embedded Controller operation regions, accessed via the Embedded Controllers described in ECDT, available before executing any control method. These operation regions may become inaccessible after OSPM runs _REG(EmbeddedControl, 0)." So acpi_bus_init() calls acpi_ec_ecdt_probe(), which calls acpi_install_address_space_handler() to install the EC's OpRegion handler, early on. This not only installs the OpRegion handler, but also calls the EC's _REG method. The _REG method call is a problem because it may rely on initialization done by the _INI methods of one of the PCI / _SB root devs, see for example: https://bugzilla.kernel.org/show_bug.cgi?id=214899 . Generally speaking _REG methods are executed when the ACPI-device they are part of has a driver bound to it. Where as _INI methods must be executed at table load time (according to the spec). The problem here is that the early acpi_install_address_space_handler() call causes the _REG handler to run too early. To allow fixing this the ACPICA code now allows to split the OpRegion handler installation and the executing of _REG into 2 separate steps. This commit uses this ACPICA functionality to fix the EC probe ordering by delaying the executing of _REG for ECDT described ECs till the matching EC device in the DSDT gets parsed and acpi_ec_add() for it gets called. This moves the calling of _REG for the EC on devices with an ECDT to the same point in time where it is called on devices without an ECDT table. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214899 Reported-and-tested-by: Johannes Penßel Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 5a21e4d58322b..73ac2f222897b 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -94,6 +94,7 @@ enum { EC_FLAGS_QUERY_ENABLED, /* Query is enabled */ EC_FLAGS_EVENT_HANDLER_INSTALLED, /* Event handler installed */ EC_FLAGS_EC_HANDLER_INSTALLED, /* OpReg handler installed */ + EC_FLAGS_EC_REG_CALLED, /* OpReg ACPI _REG method called */ EC_FLAGS_QUERY_METHODS_INSTALLED, /* _Qxx handlers installed */ EC_FLAGS_STARTED, /* Driver is started */ EC_FLAGS_STOPPED, /* Driver is stopped */ @@ -1446,6 +1447,7 @@ static bool install_gpio_irq_event_handler(struct acpi_ec *ec) * ec_install_handlers - Install service callbacks and register query methods. * @ec: Target EC. * @device: ACPI device object corresponding to @ec. + * @call_reg: If _REG should be called to notify OpRegion availability * * Install a handler for the EC address space type unless it has been installed * already. If @device is not NULL, also look for EC query methods in the @@ -1458,7 +1460,8 @@ static bool install_gpio_irq_event_handler(struct acpi_ec *ec) * -EPROBE_DEFER if GPIO IRQ acquisition needs to be deferred, * or 0 (success) otherwise. */ -static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device) +static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, + bool call_reg) { acpi_status status; @@ -1466,10 +1469,10 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device) if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) { acpi_ec_enter_noirq(ec); - status = acpi_install_address_space_handler(ec->handle, - ACPI_ADR_SPACE_EC, - &acpi_ec_space_handler, - NULL, ec); + status = acpi_install_address_space_handler_no_reg(ec->handle, + ACPI_ADR_SPACE_EC, + &acpi_ec_space_handler, + NULL, ec); if (ACPI_FAILURE(status)) { acpi_ec_stop(ec, false); return -ENODEV; @@ -1478,6 +1481,11 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device) ec->address_space_handler_holder = ec->handle; } + if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { + acpi_execute_reg_methods(ec->handle, ACPI_ADR_SPACE_EC); + set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); + } + if (!device) return 0; @@ -1564,11 +1572,11 @@ static void ec_remove_handlers(struct acpi_ec *ec) } } -static int acpi_ec_setup(struct acpi_ec *ec, struct acpi_device *device) +static int acpi_ec_setup(struct acpi_ec *ec, struct acpi_device *device, bool call_reg) { int ret; - ret = ec_install_handlers(ec, device); + ret = ec_install_handlers(ec, device, call_reg); if (ret) return ret; @@ -1633,7 +1641,7 @@ static int acpi_ec_add(struct acpi_device *device) } } - ret = acpi_ec_setup(ec, device); + ret = acpi_ec_setup(ec, device, true); if (ret) goto err; @@ -1753,7 +1761,7 @@ void __init acpi_ec_dsdt_probe(void) * At this point, the GPE is not fully initialized, so do not to * handle the events. */ - ret = acpi_ec_setup(ec, NULL); + ret = acpi_ec_setup(ec, NULL, true); if (ret) { acpi_ec_free(ec); return; @@ -1947,7 +1955,7 @@ void __init acpi_ec_ecdt_probe(void) * At this point, the namespace is not initialized, so do not find * the namespace objects, or handle the events. */ - ret = acpi_ec_setup(ec, NULL); + ret = acpi_ec_setup(ec, NULL, false); if (ret) { acpi_ec_free(ec); goto out; -- GitLab From c1ddc3dad85dda4421e852c72f7596cdb10e9fc6 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 8 Dec 2022 13:38:50 +0100 Subject: [PATCH 1229/1423] PCI: xilinx-nwl: Fix coding style violations Fix code alignments and remove additional newline. Link: https://lore.kernel.org/r/17c75e7003bb8c43a0f45ae3d7c45cac230ef852.1670503129.git.michal.simek@amd.com Signed-off-by: Michal Simek Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-xilinx-nwl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index 40d070e54ad2e..e10a58649bf57 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -474,15 +474,15 @@ static int nwl_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_domain_set_info(domain, virq + i, bit + i, &nwl_irq_chip, - domain->host_data, handle_simple_irq, - NULL, NULL); + domain->host_data, handle_simple_irq, + NULL, NULL); } mutex_unlock(&msi->lock); return 0; } static void nwl_irq_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) + unsigned int nr_irqs) { struct irq_data *data = irq_domain_get_irq_data(domain, virq); struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data); @@ -722,7 +722,6 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie) /* Enable all misc interrupts */ nwl_bridge_writel(pcie, MSGF_MISC_SR_MASKALL, MSGF_MISC_MASK); - /* Disable all legacy interrupts */ nwl_bridge_writel(pcie, (u32)~MSGF_LEG_SR_MASKALL, MSGF_LEG_MASK); -- GitLab From 1c8a8ec0a0e9a1176022a35c4daf04fe1594d270 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:44 +0100 Subject: [PATCH 1230/1423] f2fs: remove struct segment_allocation default_salloc_ops There is only single instance of these ops, so remove the indirection and call allocate_segment_by_default directly. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++--------- fs/f2fs/segment.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ff451ea18f66..bbe6556799ce9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + allocate_segment_by_default(sbi, type, true); locate_dirty_segment(sbi, old_segno); } @@ -2957,10 +2957,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3284,7 +3280,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); else - sit_i->s_ops->allocate_segment(sbi, type, false); + allocate_segment_by_default(sbi, type, false); } /* * segment dirty status should be updated after segment allocation, @@ -4270,9 +4266,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index be8f2d7d007b9..3ad1b7b6fa946 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ -- GitLab From 8442d94b8ac8d5d8300725a9ffa9def526b71170 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:45 +0100 Subject: [PATCH 1231/1423] f2fs: open code allocate_segment_by_default allocate_segment_by_default has just two callers, which use very different code pathes inside it based on the force paramter. Just open code the logic in the two callers using a new helper to decided if a new segment should be allocated. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bbe6556799ce9..c4e118eb7d197 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2849,31 +2849,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2926,7 +2915,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - allocate_segment_by_default(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3276,11 +3266,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. + */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - allocate_segment_by_default(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, -- GitLab From 5bcd655fffaec24e849bda1207446f5cc821713e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:46 +0100 Subject: [PATCH 1232/1423] f2fs: remove the unused flush argument to change_curseg Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4e118eb7d197..9486ca49ecb15 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2656,7 +2656,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2664,9 +2664,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2705,7 +2703,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2880,7 +2878,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -3276,7 +3274,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (need_new_seg(sbi, type)) new_curseg(sbi, type, false); else - change_curseg(sbi, type, true); + change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } } @@ -3539,7 +3537,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3567,7 +3565,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; -- GitLab From 398bb30d4f4e857ee1352130c6935f0fb16d7af2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Nov 2022 10:38:07 +0800 Subject: [PATCH 1233/1423] MAINTAINERS: Add f2fs bug tracker link As f2fs component in bugzilla.kernel.org was created and used since 2018-7. Signed-off-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 +++++- MAINTAINERS | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 6e67c5e6c7c37..67e1f3e86f32f 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). - git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git -For reporting bugs and sending patches, please use the following mailing list: +For sending patches, please use the following mailing list: - linux-f2fs-devel@lists.sourceforge.net +For reporting bugs, please use the following f2fs bug tracker link: + +- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs + Background and Design issues ============================ diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..01fdbb592ea70 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7828,6 +7828,7 @@ M: Chao Yu L: linux-f2fs-devel@lists.sourceforge.net S: Maintained W: https://f2fs.wiki.kernel.org/ +B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git F: Documentation/ABI/testing/sysfs-fs-f2fs F: Documentation/filesystems/f2fs.rst -- GitLab From 870af777da22505851174a34c0228042d7ed5f5f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 25 Nov 2022 19:47:36 +0800 Subject: [PATCH 1234/1423] f2fs: do some cleanup for f2fs module init Just for cleanup, no functional changes. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 46 ++++++---------------------------------------- fs/f2fs/data.c | 14 ++++---------- fs/f2fs/gc.c | 4 +--- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 8 ++------ 5 files changed, 14 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 74d3f2d2271f3..9723f0bed9237 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -1983,9 +1980,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1993,53 +1988,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 560fa80590e99..35c19248b1e2d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -4090,9 +4088,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4105,9 +4101,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1b68eda2235d..d19e26b2e875e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1904,9 +1904,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dea95b48b647d..77fd453949b1b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index daf14b55a9729..a5f6f632cf7cf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -288,9 +288,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -4647,9 +4645,7 @@ static int __init init_inodecache(void) f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) -- GitLab From 4bd1d80efb5af640f99157f39b50fb11326ce641 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 29 Aug 2022 23:52:19 +0300 Subject: [PATCH 1235/1423] riscv: mm: notify remote harts about mmu cache updates Current implementation of update_mmu_cache function performs local TLB flush. It does not take into account ASID information. Besides, it does not take into account other harts currently running the same mm context or possible migration of the running context to other harts. Meanwhile TLB flush is not performed for every context switch if ASID support is enabled. Patch [1] proposed to add ASID support to update_mmu_cache to avoid flushing local TLB entirely. This patch takes into account other harts currently running the same mm context as well as possible migration of this context to other harts. For this purpose the approach from flush_icache_mm is reused. Remote harts currently running the same mm context are informed via SBI calls that they need to flush their local TLBs. All the other harts are marked as needing a deferred TLB flush when this mm context runs on them. [1] https://lore.kernel.org/linux-riscv/20220821013926.8968-1-tjytimi@163.com/ Signed-off-by: Sergey Matyukevich Fixes: 65d4b9c53017 ("RISC-V: Implement ASID allocator") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-riscv/20220829205219.283543-1-geomatsi@gmail.com/#t Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/mmu.h | 2 ++ arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/tlbflush.h | 18 ++++++++++++++++++ arch/riscv/mm/context.c | 10 ++++++++++ arch/riscv/mm/tlbflush.c | 28 +++++++++++----------------- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 0099dc1161683..5ff1f19fd45c2 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { #ifdef CONFIG_SMP /* A local icache flush is needed before user execution can resume. */ cpumask_t icache_stale_mask; + /* A local tlb flush is needed before user execution can resume. */ + cpumask_t tlb_stale_mask; #endif } mm_context_t; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index c61ae83aadee8..2359f1f9bda9c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -415,7 +415,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * Relying on flush_tlb_fix_spurious_fault would suffice, but * the extra traps reduce performance. So, eagerly SFENCE.VMA. */ - local_flush_tlb_page(address); + flush_tlb_page(vma, address); } #define __HAVE_ARCH_UPDATE_MMU_TLB diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 801019381dea3..907b9efd39a87 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -22,6 +22,24 @@ static inline void local_flush_tlb_page(unsigned long addr) { ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory")); } + +static inline void local_flush_tlb_all_asid(unsigned long asid) +{ + __asm__ __volatile__ ("sfence.vma x0, %0" + : + : "r" (asid) + : "memory"); +} + +static inline void local_flush_tlb_page_asid(unsigned long addr, + unsigned long asid) +{ + __asm__ __volatile__ ("sfence.vma %0, %1" + : + : "r" (addr), "r" (asid) + : "memory"); +} + #else /* CONFIG_MMU */ #define local_flush_tlb_all() do { } while (0) #define local_flush_tlb_page(addr) do { } while (0) diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 7acbfbd14557e..80ce9caba8d22 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -196,6 +196,16 @@ switch_mm_fast: if (need_flush_tlb) local_flush_tlb_all(); +#ifdef CONFIG_SMP + else { + cpumask_t *mask = &mm->context.tlb_stale_mask; + + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); + local_flush_tlb_all_asid(cntx & asid_mask); + } + } +#endif } static void set_mm_noasid(struct mm_struct *mm) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 37ed760d007c3..ce7dfc81bb3fe 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -5,23 +5,7 @@ #include #include #include - -static inline void local_flush_tlb_all_asid(unsigned long asid) -{ - __asm__ __volatile__ ("sfence.vma x0, %0" - : - : "r" (asid) - : "memory"); -} - -static inline void local_flush_tlb_page_asid(unsigned long addr, - unsigned long asid) -{ - __asm__ __volatile__ ("sfence.vma %0, %1" - : - : "r" (addr), "r" (asid) - : "memory"); -} +#include void flush_tlb_all(void) { @@ -31,6 +15,7 @@ void flush_tlb_all(void) static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long size, unsigned long stride) { + struct cpumask *pmask = &mm->context.tlb_stale_mask; struct cpumask *cmask = mm_cpumask(mm); unsigned int cpuid; bool broadcast; @@ -44,6 +29,15 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, if (static_branch_unlikely(&use_asid_allocator)) { unsigned long asid = atomic_long_read(&mm->context.id); + /* + * TLB will be immediately flushed on harts concurrently + * executing this MM context. TLB flush on other harts + * is deferred until this MM context migrates there. + */ + cpumask_setall(pmask); + cpumask_clear_cpu(cpuid, pmask); + cpumask_andnot(pmask, pmask, cmask); + if (broadcast) { sbi_remote_sfence_vma_asid(cmask, start, size, asid); } else if (size <= stride) { -- GitLab From b0f4c74eadbf69a3298f38566bfaa2e202541f2f Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 11 Nov 2022 17:31:08 -0500 Subject: [PATCH 1236/1423] RISC-V: Fix unannoted hardirqs-on in return to userspace slow-path The return to userspace path in entry.S may enable interrupts without the corresponding lockdep annotation, producing a splat[0] when DEBUG_LOCKDEP is enabled. Simply calling __trace_hardirqs_on() here gets a bit messy due to the use of RA to point back to ret_from_exception, so just move the whole slow-path loop into C. It's more readable and it lets us use local_irq_{enable,disable}(), avoiding the need for manual annotations altogether. [0]: ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled()) WARNING: CPU: 2 PID: 1 at kernel/locking/lockdep.c:5512 check_flags+0x10a/0x1e0 Modules linked in: CPU: 2 PID: 1 Comm: init Not tainted 6.1.0-rc4-00160-gb56b6e2b4f31 #53 Hardware name: riscv-virtio,qemu (DT) epc : check_flags+0x10a/0x1e0 ra : check_flags+0x10a/0x1e0 status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [] lock_is_held_type+0x78/0x14e [] __might_resched+0x26/0x22c [] __might_sleep+0x3c/0x66 [] get_signal+0x9e/0xa70 [] do_notify_resume+0x6e/0x422 [] ret_from_exception+0x0/0x10 irq event stamp: 44512 hardirqs last enabled at (44511): [] _raw_spin_unlock_irqrestore+0x54/0x62 hardirqs last disabled at (44512): [] __trace_hardirqs_off+0xc/0x14 softirqs last enabled at (44472): [] __do_softirq+0x3de/0x51e softirqs last disabled at (44467): [] irq_exit+0xd6/0x104 ---[ end trace 0000000000000000 ]--- possible reason: unannotated irqs-on. Signed-off-by: Andrew Bresticker Fixes: 3c4697982982 ("riscv: Enable LOCKDEP_SUPPORT & fixup TRACE_IRQFLAGS_SUPPORT") Link: https://lore.kernel.org/r/20221111223108.1976562-1-abrestic@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 18 +++++------------- arch/riscv/kernel/signal.c | 34 +++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index b9eda3fcbd6d7..58dfa8595e19d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -263,12 +263,11 @@ ret_from_exception: #endif bnez s0, resume_kernel -resume_userspace: /* Interrupts must be disabled here so flags are checked atomically */ REG_L s0, TASK_TI_FLAGS(tp) /* current_thread_info->flags */ andi s1, s0, _TIF_WORK_MASK - bnez s1, work_pending - + bnez s1, resume_userspace_slow +resume_userspace: #ifdef CONFIG_CONTEXT_TRACKING_USER call user_enter_callable #endif @@ -368,19 +367,12 @@ resume_kernel: j restore_all #endif -work_pending: +resume_userspace_slow: /* Enter slow path for supplementary processing */ - la ra, ret_from_exception - andi s1, s0, _TIF_NEED_RESCHED - bnez s1, work_resched -work_notifysig: - /* Handle pending signals and notify-resume requests */ - csrs CSR_STATUS, SR_IE /* Enable interrupts for do_notify_resume() */ move a0, sp /* pt_regs */ move a1, s0 /* current_thread_info->flags */ - tail do_notify_resume -work_resched: - tail schedule + call do_work_pending + j resume_userspace /* Slow paths for ptrace. */ handle_syscall_trace_enter: diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 5c591123c4409..bfb2afa4135f8 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -313,19 +313,27 @@ static void do_signal(struct pt_regs *regs) } /* - * notification of userspace execution resumption - * - triggered by the _TIF_WORK_MASK flags + * Handle any pending work on the resume-to-userspace path, as indicated by + * _TIF_WORK_MASK. Entered from assembly with IRQs off. */ -asmlinkage __visible void do_notify_resume(struct pt_regs *regs, - unsigned long thread_info_flags) +asmlinkage __visible void do_work_pending(struct pt_regs *regs, + unsigned long thread_info_flags) { - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* Handle pending signal delivery */ - if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); + do { + if (thread_info_flags & _TIF_NEED_RESCHED) { + schedule(); + } else { + local_irq_enable(); + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + /* Handle pending signal delivery */ + if (thread_info_flags & (_TIF_SIGPENDING | + _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + if (thread_info_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + } + local_irq_disable(); + thread_info_flags = read_thread_flags(); + } while (thread_info_flags & _TIF_WORK_MASK); } -- GitLab From b91676fc16cd384a81e3af52c641aa61985cc231 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 14 Nov 2022 14:35:34 +0530 Subject: [PATCH 1237/1423] RISC-V: Fix MEMREMAP_WB for systems with Svpbmt Currently, the memremap() called with MEMREMAP_WB maps memory using the generic ioremap() function which breaks on system with Svpbmt because memory mapped using _PAGE_IOREMAP page attributes is treated as strongly-ordered non-cacheable IO memory. To address this, we implement RISC-V specific arch_memremap_wb() which maps memory using _PAGE_KERNEL page attributes resulting in write-back cacheable mapping on systems with Svpbmt. Fixes: ff689fd21cb1 ("riscv: add RISC-V Svpbmt extension support") Co-developed-by: Mayuresh Chitale Signed-off-by: Mayuresh Chitale Signed-off-by: Anup Patel Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221114090536.1662624-2-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/io.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 92080a2279372..42497d487a174 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -135,4 +135,9 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #include +#ifdef CONFIG_MMU +#define arch_memremap_wb(addr, size) \ + ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) +#endif + #endif /* _ASM_RISCV_IO_H */ -- GitLab From a49ab905a1fc8630a94221f9a06ce0dafb266576 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 14 Nov 2022 14:35:35 +0530 Subject: [PATCH 1238/1423] RISC-V: Implement arch specific PMEM APIs The NVDIMM PMEM driver expects arch specific APIs for cache maintenance and if arch does not provide these APIs then NVDIMM PMEM driver will always use MEMREMAP_WT to map persistent memory which in-turn maps as UC memory type defined by the RISC-V Svpbmt specification. Now that the Svpbmt and Zicbom support is available in RISC-V kernel, we implement PMEM APIs using ALT_CMO_OP() macros so that the NVDIMM PMEM driver can use MEMREMAP_WB to map persistent memory. Co-developed-by: Mayuresh Chitale Signed-off-by: Mayuresh Chitale Signed-off-by: Anup Patel Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221114090536.1662624-3-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/mm/Makefile | 1 + arch/riscv/mm/pmem.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 arch/riscv/mm/pmem.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6b48a3ae98439..025e2a1b1c60d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -25,6 +25,7 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_MMIOWB + select ARCH_HAS_PMEM_API select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index d76aabf4b94d6..b4f35da889bfb 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -13,6 +13,7 @@ obj-y += extable.o obj-$(CONFIG_MMU) += fault.o pageattr.o obj-y += cacheflush.o obj-y += context.o +obj-y += pmem.o ifeq ($(CONFIG_MMU),y) obj-$(CONFIG_SMP) += tlbflush.o diff --git a/arch/riscv/mm/pmem.c b/arch/riscv/mm/pmem.c new file mode 100644 index 0000000000000..089df92ae8760 --- /dev/null +++ b/arch/riscv/mm/pmem.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Ventana Micro Systems Inc. + */ + +#include +#include + +#include + +void arch_wb_cache_pmem(void *addr, size_t size) +{ + ALT_CMO_OP(clean, addr, size, riscv_cbom_block_size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + ALT_CMO_OP(inval, addr, size, riscv_cbom_block_size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); -- GitLab From 497bcbe3ce0466123a834f2777a8a762bd5d7aae Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 14 Nov 2022 14:35:36 +0530 Subject: [PATCH 1239/1423] RISC-V: Enable PMEM drivers We now have PMEM arch support available in RISC-V kernel so let us enable relevant drivers in defconfig. Signed-off-by: Anup Patel Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221114090536.1662624-4-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 05fd5fcf24f9b..462da9f7410d0 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -159,6 +159,7 @@ CONFIG_VIRTIO_MMIO=y CONFIG_RPMSG_CHAR=y CONFIG_RPMSG_CTRL=y CONFIG_RPMSG_VIRTIO=y +CONFIG_LIBNVDIMM=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y -- GitLab From fdb1742aff436399f5769a7559bbb71c7f37a85f Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 18 Nov 2022 10:42:59 +0000 Subject: [PATCH 1240/1423] irqchip/sifive-plic: remove user selectability of SIFIVE_PLIC The SiFive PLIC driver is used by all current implementations, including those that do not have a SiFive PLIC. The current driver supports more than just SiFive PLICs at present and, where possible, future PLIC implementations will also use this driver. As every supported RISC-V SoC selects the driver directly in Kconfig.socs there's no point in exposing this kconfig option to users. The Kconfig help text, in its current form, is misleading. There's no point doing anything about that though, as it will no longer be user selectable. Remove it. Suggested-by: Marc Zyngier Signed-off-by: Conor Dooley Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118104300.85016-2-conor@kernel.org Signed-off-by: Palmer Dabbelt --- drivers/irqchip/Kconfig | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 7ef9f5e696d31..ecb3e3119d2e1 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -551,18 +551,10 @@ config RISCV_INTC If you don't know what to do here, say Y. config SIFIVE_PLIC - bool "SiFive Platform-Level Interrupt Controller" + bool depends on RISCV select IRQ_DOMAIN_HIERARCHY select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP - help - This enables support for the PLIC chip found in SiFive (and - potentially other) RISC-V systems. The PLIC controls devices - interrupts and connects them to each core's local interrupt - controller. Aside from timer and software interrupts, all other - interrupt sources are subordinate to the PLIC. - - If you don't know what to do here, say Y. config EXYNOS_IRQ_COMBINER bool "Samsung Exynos IRQ combiner support" if COMPILE_TEST -- GitLab From d8fb13070c3c99b6a17b75fda28943f9261e23e7 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 18 Nov 2022 10:43:00 +0000 Subject: [PATCH 1241/1423] irqchip/riscv-intc: remove user selectability of RISCV_INTC Since commit e71ee06e3ca3 ("RISC-V: Force select RISCV_INTC for CONFIG_RISCV") the driver has been enabled at the arch level - and is mandatory anyway. There's no point exposing this as a choice to users, so stop bothering. Signed-off-by: Conor Dooley Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118104300.85016-3-conor@kernel.org Signed-off-by: Palmer Dabbelt --- drivers/irqchip/Kconfig | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index ecb3e3119d2e1..4633a549ebbf6 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -538,17 +538,8 @@ config TI_PRUSS_INTC different processors within the SoC. config RISCV_INTC - bool "RISC-V Local Interrupt Controller" + bool depends on RISCV - default y - help - This enables support for the per-HART local interrupt controller - found in standard RISC-V systems. The per-HART local interrupt - controller handles timer interrupts, software interrupts, and - hardware interrupts. Without a per-HART local interrupt controller, - a RISC-V system will be unable to handle any interrupts. - - If you don't know what to do here, say Y. config SIFIVE_PLIC bool -- GitLab From bf3d7b1d8499ca46874c7373d2043ecbe252cccc Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 18 Nov 2022 10:43:01 +0000 Subject: [PATCH 1242/1423] RISC-V: stop selecting SIFIVE_PLIC at the SoC level The SIFIVE_PLIC driver is used by all current RISC-V SoCs & will be, where possible, used for future implementations. Rather than having each driver select the option on a case-by-case basis, do so at the arch level. Signed-off-by: Conor Dooley Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20221118104300.85016-4-conor@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/Kconfig.socs | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6b48a3ae98439..3ee67dc4e98bf 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -123,6 +123,7 @@ config RISCV select PCI_MSI if PCI select RISCV_INTC select RISCV_TIMER if RISCV_SBI + select SIFIVE_PLIC select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 69774bb362d6a..15e391f38f758 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -3,7 +3,6 @@ menu "SoC selection" config SOC_MICROCHIP_POLARFIRE bool "Microchip PolarFire SoCs" select MCHP_CLK_MPFS - select SIFIVE_PLIC help This enables support for Microchip PolarFire SoC platforms. @@ -13,7 +12,6 @@ config SOC_SIFIVE select SERIAL_SIFIVE_CONSOLE if TTY select CLK_SIFIVE select CLK_SIFIVE_PRCI - select SIFIVE_PLIC select ERRATA_SIFIVE if !XIP_KERNEL help This enables support for SiFive SoC platform hardware. @@ -22,7 +20,6 @@ config SOC_STARFIVE bool "StarFive SoCs" select PINCTRL select RESET_CONTROLLER - select SIFIVE_PLIC help This enables support for StarFive SoC platform hardware. @@ -34,7 +31,6 @@ config SOC_VIRT select POWER_RESET_SYSCON_POWEROFF select GOLDFISH select RTC_DRV_GOLDFISH if RTC_CLASS - select SIFIVE_PLIC select PM_GENERIC_DOMAINS if PM select PM_GENERIC_DOMAINS_OF if PM && OF select RISCV_SBI_CPUIDLE if CPU_IDLE && RISCV_SBI @@ -47,7 +43,6 @@ config SOC_CANAAN select CLINT_TIMER if RISCV_M_MODE select SERIAL_SIFIVE if TTY select SERIAL_SIFIVE_CONSOLE if TTY - select SIFIVE_PLIC select ARCH_HAS_RESET_CONTROLLER select PINCTRL select COMMON_CLK -- GitLab From de59b6ed0618b909be78f6bc60874a57dd016063 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 23 Nov 2022 23:02:57 +0800 Subject: [PATCH 1243/1423] riscv: boot: add zstd support Support build the zstd compressed Image.zst. Similar as other compressed formats, the Image.zst is not self-decompressing and the bootloader still needs to handle decompression before launching the kernel image. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20221123150257.3108-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index d1a49adcb1d76..c72de7232abb2 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -56,6 +56,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE $(obj)/Image.lzo: $(obj)/Image FORCE $(call if_changed,lzo) +$(obj)/Image.zst: $(obj)/Image FORCE + $(call if_changed,zstd) + $(obj)/loader.bin: $(obj)/loader FORCE $(call if_changed,objcopy) -- GitLab From 0c49688174f5347c3f8012e84c0ffa0d2b2890c8 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 26 Nov 2022 00:09:19 -0600 Subject: [PATCH 1244/1423] riscv: Fix crash during early errata patching The patch function for the T-Head PBMT errata calls __pa_symbol() before relocation. This crashes when CONFIG_DEBUG_VIRTUAL is enabled, because __pa_symbol() forwards to __phys_addr_symbol(), and __phys_addr_symbol() checks against the absolute kernel start/end address. Fix this by checking against the kernel map instead of a symbol address. Fixes: a35707c3d850 ("riscv: add memory-type errata for T-Head") Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20221126060920.65009-1-samuel@sholland.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/physaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index 19cf25a74ee29..9b18bda74154e 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { unsigned long kernel_start = kernel_map.virt_addr; - unsigned long kernel_end = (unsigned long)_end; + unsigned long kernel_end = kernel_start + kernel_map.size; /* * Boundary checking aginst the kernel image mapping. -- GitLab From 583286e2072ed25c31b7db14d69fdf03f1fae7ba Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 26 Nov 2022 00:09:20 -0600 Subject: [PATCH 1245/1423] riscv: Move cast inside kernel_mapping_[pv]a_to_[vp]a Before commit 44c922572952 ("RISC-V: enable XIP"), these macros cast their argument to unsigned long. That commit moved the cast after an assignment to an unsigned long variable, rendering it ineffectual. Move the cast back, so we can remove the cast at each call site. Reviewed-by: Alexandre Ghiti Reviewed-by: Heiko Stuebner Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20221126060920.65009-2-samuel@sholland.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 18 +++++++++--------- arch/riscv/mm/init.c | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index ac70b0fd9a9a3..9f432c1b52899 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -123,20 +123,20 @@ extern phys_addr_t phys_ram_base; ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < PAGE_OFFSET + KERN_VIRT_SIZE)) #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)) -#define kernel_mapping_pa_to_va(y) ({ \ - unsigned long _y = y; \ - (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < phys_ram_base) ? \ - (void *)((unsigned long)(_y) + kernel_map.va_kernel_xip_pa_offset) : \ - (void *)((unsigned long)(_y) + kernel_map.va_kernel_pa_offset + XIP_OFFSET); \ +#define kernel_mapping_pa_to_va(y) ({ \ + unsigned long _y = (unsigned long)(y); \ + (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < phys_ram_base) ? \ + (void *)(_y + kernel_map.va_kernel_xip_pa_offset) : \ + (void *)(_y + kernel_map.va_kernel_pa_offset + XIP_OFFSET); \ }) #define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - kernel_map.va_pa_offset) #define kernel_mapping_va_to_pa(y) ({ \ - unsigned long _y = y; \ - (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < kernel_map.virt_addr + XIP_OFFSET) ? \ - ((unsigned long)(_y) - kernel_map.va_kernel_xip_pa_offset) : \ - ((unsigned long)(_y) - kernel_map.va_kernel_pa_offset - XIP_OFFSET); \ + unsigned long _y = (unsigned long)(y); \ + (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < kernel_map.virt_addr + XIP_OFFSET) ? \ + (_y - kernel_map.va_kernel_xip_pa_offset) : \ + (_y - kernel_map.va_kernel_pa_offset - XIP_OFFSET); \ }) #define __va_to_pa_nodebug(x) ({ \ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b56a0a75533fe..7d59516ce6b39 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -927,15 +927,15 @@ static void __init pt_ops_set_early(void) */ static void __init pt_ops_set_fixmap(void) { - pt_ops.alloc_pte = kernel_mapping_pa_to_va((uintptr_t)alloc_pte_fixmap); - pt_ops.get_pte_virt = kernel_mapping_pa_to_va((uintptr_t)get_pte_virt_fixmap); + pt_ops.alloc_pte = kernel_mapping_pa_to_va(alloc_pte_fixmap); + pt_ops.get_pte_virt = kernel_mapping_pa_to_va(get_pte_virt_fixmap); #ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); - pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); - pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); - pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); - pt_ops.alloc_p4d = kernel_mapping_pa_to_va((uintptr_t)alloc_p4d_fixmap); - pt_ops.get_p4d_virt = kernel_mapping_pa_to_va((uintptr_t)get_p4d_virt_fixmap); + pt_ops.alloc_pmd = kernel_mapping_pa_to_va(alloc_pmd_fixmap); + pt_ops.get_pmd_virt = kernel_mapping_pa_to_va(get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va(alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va(get_pud_virt_fixmap); + pt_ops.alloc_p4d = kernel_mapping_pa_to_va(alloc_p4d_fixmap); + pt_ops.get_p4d_virt = kernel_mapping_pa_to_va(get_p4d_virt_fixmap); #endif } -- GitLab From e8b9a055fa0481679132781db574ecb771960f16 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 7 Dec 2022 21:48:07 +0000 Subject: [PATCH 1246/1423] KVM: arm64: selftests: Align VA space allocator with TTBR0 An interesting feature of the Arm architecture is that the stage-1 MMU supports two distinct VA regions, controlled by TTBR{0,1}_EL1. As KVM selftests on arm64 only uses TTBR0_EL1, the VA space is constrained to [0, 2^(va_bits-1)). This is different from other architectures that allow for addressing low and high regions of the VA space from a single page table. KVM selftests' VA space allocator presumes the valid address range is split between low and high memory based the MSB, which of course is a poor match for arm64's TTBR0 region. Allow architectures to override the default VA space layout. Make use of the override to align vpages_valid with the behavior of TTBR0 on arm64. Signed-off-by: Oliver Upton Message-Id: <20221207214809.489070-4-oliver.upton@linux.dev> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 1 + .../testing/selftests/kvm/lib/aarch64/processor.c | 10 ++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 15 ++++++++++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 37500c92dd0a6..2e267cd692889 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -408,6 +408,7 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vm_populate_vaddr_bitmap(struct kvm_vm *vm); vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 316de70db91da..5972a23b27654 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -541,3 +541,13 @@ void kvm_selftest_arch_init(void) */ guest_modes_append_default(); } + +void vm_vaddr_populate_bitmap(struct kvm_vm *vm) +{ + /* + * arm64 selftests use only TTBR0_EL1, meaning that the valid VA space + * is [0, 2^(64 - TCR_EL1.T0SZ)). + */ + sparsebit_set_num(vm->vpages_valid, 0, + (1ULL << vm->va_bits) >> vm->page_shift); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e9607eb089bee..c88c3ace16d2f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -186,6 +186,15 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); +__weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) +{ + sparsebit_set_num(vm->vpages_valid, + 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + sparsebit_set_num(vm->vpages_valid, + (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, + (1ULL << (vm->va_bits - 1)) >> vm->page_shift); +} + struct kvm_vm *____vm_create(enum vm_guest_mode mode) { struct kvm_vm *vm; @@ -274,11 +283,7 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) /* Limit to VA-bit canonical virtual addresses. */ vm->vpages_valid = sparsebit_alloc(); - sparsebit_set_num(vm->vpages_valid, - 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); - sparsebit_set_num(vm->vpages_valid, - (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, - (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + vm_vaddr_populate_bitmap(vm); /* Limit physical addresses to PA-bits. */ vm->max_gfn = vm_compute_max_gfn(vm); -- GitLab From 2afc1fbbdab2aee831561f09f859989dcd5ed648 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 7 Dec 2022 21:48:08 +0000 Subject: [PATCH 1247/1423] KVM: selftests: Allocate ucall pool from MEM_REGION_DATA MEM_REGION_TEST_DATA is meant to hold data explicitly used by a selftest, not implicit allocations due to the selftests infrastructure. Allocate the ucall pool from MEM_REGION_DATA much like the rest of the selftests library allocations. Fixes: 426729b2cf2e ("KVM: selftests: Add ucall pool based implementation") Signed-off-by: Oliver Upton Message-Id: <20221207214809.489070-5-oliver.upton@linux.dev> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/ucall_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index fcae96461e460..72420171c0d47 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -22,7 +22,7 @@ void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) vm_vaddr_t vaddr; int i; - vaddr = vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR); + vaddr = __vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR, MEM_REGION_DATA); hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr); memset(hdr, 0, sizeof(*hdr)); -- GitLab From e9612987e437b7ada686f472c7596686fabecb2b Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:12 +0000 Subject: [PATCH 1248/1423] crypto: qat - relocate bufferlist logic Move the logic that maps, unmaps and converts scatterlists into QAT bufferlists from qat_algs.c to a new module, qat_bl. This is to allow reuse of the logic by the data compression service. This commit does not implement any functional change. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/Makefile | 3 +- drivers/crypto/qat/qat_common/qat_algs.c | 184 +-------------------- drivers/crypto/qat/qat_common/qat_bl.c | 194 +++++++++++++++++++++++ drivers/crypto/qat/qat_common/qat_bl.h | 17 ++ 4 files changed, 214 insertions(+), 184 deletions(-) create mode 100644 drivers/crypto/qat/qat_common/qat_bl.c create mode 100644 drivers/crypto/qat/qat_common/qat_bl.h diff --git a/drivers/crypto/qat/qat_common/Makefile b/drivers/crypto/qat/qat_common/Makefile index 80919cfcc29da..b0587d03eac29 100644 --- a/drivers/crypto/qat/qat_common/Makefile +++ b/drivers/crypto/qat/qat_common/Makefile @@ -19,7 +19,8 @@ intel_qat-objs := adf_cfg.o \ qat_asym_algs.o \ qat_algs_send.o \ qat_uclo.o \ - qat_hal.o + qat_hal.o \ + qat_bl.o intel_qat-$(CONFIG_DEBUG_FS) += adf_transport_debug.o intel_qat-$(CONFIG_PCI_IOV) += adf_sriov.o adf_vf_isr.o adf_pfvf_utils.o \ diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index cad9c58caab13..2ee4fa64032fa 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -23,6 +23,7 @@ #include "icp_qat_hw.h" #include "icp_qat_fw.h" #include "icp_qat_fw_la.h" +#include "qat_bl.h" #define QAT_AES_HW_CONFIG_ENC(alg, mode) \ ICP_QAT_HW_CIPHER_CONFIG_BUILD(mode, alg, \ @@ -663,189 +664,6 @@ static int qat_alg_aead_setkey(struct crypto_aead *tfm, const u8 *key, return qat_alg_aead_newkey(tfm, key, keylen); } -static void qat_alg_free_bufl(struct qat_crypto_instance *inst, - struct qat_crypto_request *qat_req) -{ - struct device *dev = &GET_DEV(inst->accel_dev); - struct qat_alg_buf_list *bl = qat_req->buf.bl; - struct qat_alg_buf_list *blout = qat_req->buf.blout; - dma_addr_t blp = qat_req->buf.blp; - dma_addr_t blpout = qat_req->buf.bloutp; - size_t sz = qat_req->buf.sz; - size_t sz_out = qat_req->buf.sz_out; - int bl_dma_dir; - int i; - - bl_dma_dir = blp != blpout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; - - for (i = 0; i < bl->num_bufs; i++) - dma_unmap_single(dev, bl->bufers[i].addr, - bl->bufers[i].len, bl_dma_dir); - - dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); - - if (!qat_req->buf.sgl_src_valid) - kfree(bl); - - if (blp != blpout) { - /* If out of place operation dma unmap only data */ - int bufless = blout->num_bufs - blout->num_mapped_bufs; - - for (i = bufless; i < blout->num_bufs; i++) { - dma_unmap_single(dev, blout->bufers[i].addr, - blout->bufers[i].len, - DMA_FROM_DEVICE); - } - dma_unmap_single(dev, blpout, sz_out, DMA_TO_DEVICE); - - if (!qat_req->buf.sgl_dst_valid) - kfree(blout); - } -} - -static int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst, - struct scatterlist *sgl, - struct scatterlist *sglout, - struct qat_crypto_request *qat_req, - gfp_t flags) -{ - struct device *dev = &GET_DEV(inst->accel_dev); - int i, sg_nctr = 0; - int n = sg_nents(sgl); - struct qat_alg_buf_list *bufl; - struct qat_alg_buf_list *buflout = NULL; - dma_addr_t blp = DMA_MAPPING_ERROR; - dma_addr_t bloutp = DMA_MAPPING_ERROR; - struct scatterlist *sg; - size_t sz_out, sz = struct_size(bufl, bufers, n); - int node = dev_to_node(&GET_DEV(inst->accel_dev)); - int bufl_dma_dir; - - if (unlikely(!n)) - return -EINVAL; - - qat_req->buf.sgl_src_valid = false; - qat_req->buf.sgl_dst_valid = false; - - if (n > QAT_MAX_BUFF_DESC) { - bufl = kzalloc_node(sz, flags, node); - if (unlikely(!bufl)) - return -ENOMEM; - } else { - bufl = &qat_req->buf.sgl_src.sgl_hdr; - memset(bufl, 0, sizeof(struct qat_alg_buf_list)); - qat_req->buf.sgl_src_valid = true; - } - - bufl_dma_dir = sgl != sglout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; - - for_each_sg(sgl, sg, n, i) - bufl->bufers[i].addr = DMA_MAPPING_ERROR; - - for_each_sg(sgl, sg, n, i) { - int y = sg_nctr; - - if (!sg->length) - continue; - - bufl->bufers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, - bufl_dma_dir); - bufl->bufers[y].len = sg->length; - if (unlikely(dma_mapping_error(dev, bufl->bufers[y].addr))) - goto err_in; - sg_nctr++; - } - bufl->num_bufs = sg_nctr; - blp = dma_map_single(dev, bufl, sz, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(dev, blp))) - goto err_in; - qat_req->buf.bl = bufl; - qat_req->buf.blp = blp; - qat_req->buf.sz = sz; - /* Handle out of place operation */ - if (sgl != sglout) { - struct qat_alg_buf *bufers; - - n = sg_nents(sglout); - sz_out = struct_size(buflout, bufers, n); - sg_nctr = 0; - - if (n > QAT_MAX_BUFF_DESC) { - buflout = kzalloc_node(sz_out, flags, node); - if (unlikely(!buflout)) - goto err_in; - } else { - buflout = &qat_req->buf.sgl_dst.sgl_hdr; - memset(buflout, 0, sizeof(struct qat_alg_buf_list)); - qat_req->buf.sgl_dst_valid = true; - } - - bufers = buflout->bufers; - for_each_sg(sglout, sg, n, i) - bufers[i].addr = DMA_MAPPING_ERROR; - - for_each_sg(sglout, sg, n, i) { - int y = sg_nctr; - - if (!sg->length) - continue; - - bufers[y].addr = dma_map_single(dev, sg_virt(sg), - sg->length, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(dev, bufers[y].addr))) - goto err_out; - bufers[y].len = sg->length; - sg_nctr++; - } - buflout->num_bufs = sg_nctr; - buflout->num_mapped_bufs = sg_nctr; - bloutp = dma_map_single(dev, buflout, sz_out, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(dev, bloutp))) - goto err_out; - qat_req->buf.blout = buflout; - qat_req->buf.bloutp = bloutp; - qat_req->buf.sz_out = sz_out; - } else { - /* Otherwise set the src and dst to the same address */ - qat_req->buf.bloutp = qat_req->buf.blp; - qat_req->buf.sz_out = 0; - } - return 0; - -err_out: - if (!dma_mapping_error(dev, bloutp)) - dma_unmap_single(dev, bloutp, sz_out, DMA_TO_DEVICE); - - n = sg_nents(sglout); - for (i = 0; i < n; i++) - if (!dma_mapping_error(dev, buflout->bufers[i].addr)) - dma_unmap_single(dev, buflout->bufers[i].addr, - buflout->bufers[i].len, - DMA_FROM_DEVICE); - - if (!qat_req->buf.sgl_dst_valid) - kfree(buflout); - -err_in: - if (!dma_mapping_error(dev, blp)) - dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); - - n = sg_nents(sgl); - for (i = 0; i < n; i++) - if (!dma_mapping_error(dev, bufl->bufers[i].addr)) - dma_unmap_single(dev, bufl->bufers[i].addr, - bufl->bufers[i].len, - bufl_dma_dir); - - if (!qat_req->buf.sgl_src_valid) - kfree(bufl); - - dev_err(dev, "Failed to map buf for dma\n"); - return -ENOMEM; -} - static void qat_aead_alg_callback(struct icp_qat_fw_la_resp *qat_resp, struct qat_crypto_request *qat_req) { diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c new file mode 100644 index 0000000000000..6d0a39f8ce109 --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2014 - 2022 Intel Corporation */ +#include +#include +#include +#include +#include +#include +#include "adf_accel_devices.h" +#include "qat_bl.h" +#include "qat_crypto.h" + +void qat_alg_free_bufl(struct qat_crypto_instance *inst, + struct qat_crypto_request *qat_req) +{ + struct device *dev = &GET_DEV(inst->accel_dev); + struct qat_alg_buf_list *bl = qat_req->buf.bl; + struct qat_alg_buf_list *blout = qat_req->buf.blout; + dma_addr_t blp = qat_req->buf.blp; + dma_addr_t blpout = qat_req->buf.bloutp; + size_t sz = qat_req->buf.sz; + size_t sz_out = qat_req->buf.sz_out; + int bl_dma_dir; + int i; + + bl_dma_dir = blp != blpout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; + + for (i = 0; i < bl->num_bufs; i++) + dma_unmap_single(dev, bl->bufers[i].addr, + bl->bufers[i].len, bl_dma_dir); + + dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); + + if (!qat_req->buf.sgl_src_valid) + kfree(bl); + + if (blp != blpout) { + /* If out of place operation dma unmap only data */ + int bufless = blout->num_bufs - blout->num_mapped_bufs; + + for (i = bufless; i < blout->num_bufs; i++) { + dma_unmap_single(dev, blout->bufers[i].addr, + blout->bufers[i].len, + DMA_FROM_DEVICE); + } + dma_unmap_single(dev, blpout, sz_out, DMA_TO_DEVICE); + + if (!qat_req->buf.sgl_dst_valid) + kfree(blout); + } +} + +int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_crypto_request *qat_req, + gfp_t flags) +{ + struct device *dev = &GET_DEV(inst->accel_dev); + int i, sg_nctr = 0; + int n = sg_nents(sgl); + struct qat_alg_buf_list *bufl; + struct qat_alg_buf_list *buflout = NULL; + dma_addr_t blp = DMA_MAPPING_ERROR; + dma_addr_t bloutp = DMA_MAPPING_ERROR; + struct scatterlist *sg; + size_t sz_out, sz = struct_size(bufl, bufers, n); + int node = dev_to_node(&GET_DEV(inst->accel_dev)); + int bufl_dma_dir; + + if (unlikely(!n)) + return -EINVAL; + + qat_req->buf.sgl_src_valid = false; + qat_req->buf.sgl_dst_valid = false; + + if (n > QAT_MAX_BUFF_DESC) { + bufl = kzalloc_node(sz, flags, node); + if (unlikely(!bufl)) + return -ENOMEM; + } else { + bufl = &qat_req->buf.sgl_src.sgl_hdr; + memset(bufl, 0, sizeof(struct qat_alg_buf_list)); + qat_req->buf.sgl_src_valid = true; + } + + bufl_dma_dir = sgl != sglout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; + + for_each_sg(sgl, sg, n, i) + bufl->bufers[i].addr = DMA_MAPPING_ERROR; + + for_each_sg(sgl, sg, n, i) { + int y = sg_nctr; + + if (!sg->length) + continue; + + bufl->bufers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + bufl_dma_dir); + bufl->bufers[y].len = sg->length; + if (unlikely(dma_mapping_error(dev, bufl->bufers[y].addr))) + goto err_in; + sg_nctr++; + } + bufl->num_bufs = sg_nctr; + blp = dma_map_single(dev, bufl, sz, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev, blp))) + goto err_in; + qat_req->buf.bl = bufl; + qat_req->buf.blp = blp; + qat_req->buf.sz = sz; + /* Handle out of place operation */ + if (sgl != sglout) { + struct qat_alg_buf *bufers; + + n = sg_nents(sglout); + sz_out = struct_size(buflout, bufers, n); + sg_nctr = 0; + + if (n > QAT_MAX_BUFF_DESC) { + buflout = kzalloc_node(sz_out, flags, node); + if (unlikely(!buflout)) + goto err_in; + } else { + buflout = &qat_req->buf.sgl_dst.sgl_hdr; + memset(buflout, 0, sizeof(struct qat_alg_buf_list)); + qat_req->buf.sgl_dst_valid = true; + } + + bufers = buflout->bufers; + for_each_sg(sglout, sg, n, i) + bufers[i].addr = DMA_MAPPING_ERROR; + + for_each_sg(sglout, sg, n, i) { + int y = sg_nctr; + + if (!sg->length) + continue; + + bufers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(dev, bufers[y].addr))) + goto err_out; + bufers[y].len = sg->length; + sg_nctr++; + } + buflout->num_bufs = sg_nctr; + buflout->num_mapped_bufs = sg_nctr; + bloutp = dma_map_single(dev, buflout, sz_out, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev, bloutp))) + goto err_out; + qat_req->buf.blout = buflout; + qat_req->buf.bloutp = bloutp; + qat_req->buf.sz_out = sz_out; + } else { + /* Otherwise set the src and dst to the same address */ + qat_req->buf.bloutp = qat_req->buf.blp; + qat_req->buf.sz_out = 0; + } + return 0; + +err_out: + if (!dma_mapping_error(dev, bloutp)) + dma_unmap_single(dev, bloutp, sz_out, DMA_TO_DEVICE); + + n = sg_nents(sglout); + for (i = 0; i < n; i++) + if (!dma_mapping_error(dev, buflout->bufers[i].addr)) + dma_unmap_single(dev, buflout->bufers[i].addr, + buflout->bufers[i].len, + DMA_FROM_DEVICE); + + if (!qat_req->buf.sgl_dst_valid) + kfree(buflout); + +err_in: + if (!dma_mapping_error(dev, blp)) + dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); + + n = sg_nents(sgl); + for (i = 0; i < n; i++) + if (!dma_mapping_error(dev, bufl->bufers[i].addr)) + dma_unmap_single(dev, bufl->bufers[i].addr, + bufl->bufers[i].len, + bufl_dma_dir); + + if (!qat_req->buf.sgl_src_valid) + kfree(bufl); + + dev_err(dev, "Failed to map buf for dma\n"); + return -ENOMEM; +} diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h new file mode 100644 index 0000000000000..7a916f1ec645f --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2014 - 2022 Intel Corporation */ +#ifndef QAT_BL_H +#define QAT_BL_H +#include +#include +#include "qat_crypto.h" + +void qat_alg_free_bufl(struct qat_crypto_instance *inst, + struct qat_crypto_request *qat_req); +int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_crypto_request *qat_req, + gfp_t flags); + +#endif -- GitLab From b0cd997f35598c4fc01bf22061e1eb88fc10afad Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:13 +0000 Subject: [PATCH 1249/1423] crypto: qat - rename bufferlist functions Rename the functions qat_alg_sgl_to_bufl() and qat_alg_free_bufl() as qat_bl_sgl_to_bufl() and qat_bl_free_bufl() after their relocation into the qat_bl module. This commit does not implement any functional change. Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs.c | 20 ++++++++++---------- drivers/crypto/qat/qat_common/qat_bl.c | 14 +++++++------- drivers/crypto/qat/qat_common/qat_bl.h | 14 +++++++------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 2ee4fa64032fa..3e7e9fffe28b3 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -673,7 +673,7 @@ static void qat_aead_alg_callback(struct icp_qat_fw_la_resp *qat_resp, u8 stat_filed = qat_resp->comn_resp.comn_status; int res = 0, qat_res = ICP_QAT_FW_COMN_RESP_CRYPTO_STAT_GET(stat_filed); - qat_alg_free_bufl(inst, qat_req); + qat_bl_free_bufl(inst, qat_req); if (unlikely(qat_res != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) res = -EBADMSG; areq->base.complete(&areq->base, res); @@ -743,7 +743,7 @@ static void qat_skcipher_alg_callback(struct icp_qat_fw_la_resp *qat_resp, u8 stat_filed = qat_resp->comn_resp.comn_status; int res = 0, qat_res = ICP_QAT_FW_COMN_RESP_CRYPTO_STAT_GET(stat_filed); - qat_alg_free_bufl(inst, qat_req); + qat_bl_free_bufl(inst, qat_req); if (unlikely(qat_res != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) res = -EINVAL; @@ -799,7 +799,7 @@ static int qat_alg_aead_dec(struct aead_request *areq) if (cipher_len % AES_BLOCK_SIZE != 0) return -EINVAL; - ret = qat_alg_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); if (unlikely(ret)) return ret; @@ -821,7 +821,7 @@ static int qat_alg_aead_dec(struct aead_request *areq) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &areq->base); if (ret == -ENOSPC) - qat_alg_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst, qat_req); return ret; } @@ -842,7 +842,7 @@ static int qat_alg_aead_enc(struct aead_request *areq) if (areq->cryptlen % AES_BLOCK_SIZE != 0) return -EINVAL; - ret = qat_alg_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); if (unlikely(ret)) return ret; @@ -866,7 +866,7 @@ static int qat_alg_aead_enc(struct aead_request *areq) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &areq->base); if (ret == -ENOSPC) - qat_alg_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst, qat_req); return ret; } @@ -1027,7 +1027,7 @@ static int qat_alg_skcipher_encrypt(struct skcipher_request *req) if (req->cryptlen == 0) return 0; - ret = qat_alg_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); if (unlikely(ret)) return ret; @@ -1048,7 +1048,7 @@ static int qat_alg_skcipher_encrypt(struct skcipher_request *req) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &req->base); if (ret == -ENOSPC) - qat_alg_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst, qat_req); return ret; } @@ -1093,7 +1093,7 @@ static int qat_alg_skcipher_decrypt(struct skcipher_request *req) if (req->cryptlen == 0) return 0; - ret = qat_alg_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); if (unlikely(ret)) return ret; @@ -1115,7 +1115,7 @@ static int qat_alg_skcipher_decrypt(struct skcipher_request *req) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &req->base); if (ret == -ENOSPC) - qat_alg_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst, qat_req); return ret; } diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index 6d0a39f8ce109..8f7743f3c89b9 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -10,8 +10,8 @@ #include "qat_bl.h" #include "qat_crypto.h" -void qat_alg_free_bufl(struct qat_crypto_instance *inst, - struct qat_crypto_request *qat_req) +void qat_bl_free_bufl(struct qat_crypto_instance *inst, + struct qat_crypto_request *qat_req) { struct device *dev = &GET_DEV(inst->accel_dev); struct qat_alg_buf_list *bl = qat_req->buf.bl; @@ -50,11 +50,11 @@ void qat_alg_free_bufl(struct qat_crypto_instance *inst, } } -int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst, - struct scatterlist *sgl, - struct scatterlist *sglout, - struct qat_crypto_request *qat_req, - gfp_t flags) +int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_crypto_request *qat_req, + gfp_t flags) { struct device *dev = &GET_DEV(inst->accel_dev); int i, sg_nctr = 0; diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 7a916f1ec645f..ed4c200ac6197 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -6,12 +6,12 @@ #include #include "qat_crypto.h" -void qat_alg_free_bufl(struct qat_crypto_instance *inst, - struct qat_crypto_request *qat_req); -int qat_alg_sgl_to_bufl(struct qat_crypto_instance *inst, - struct scatterlist *sgl, - struct scatterlist *sglout, - struct qat_crypto_request *qat_req, - gfp_t flags); +void qat_bl_free_bufl(struct qat_crypto_instance *inst, + struct qat_crypto_request *qat_req); +int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_crypto_request *qat_req, + gfp_t flags); #endif -- GitLab From 3ed330d0dba61d2e08a0eed7aa3d5def3f0c749b Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:14 +0000 Subject: [PATCH 1250/1423] crypto: qat - change bufferlist logic interface The functions qat_alg_sgl_to_bufl() and qat_alg_free_bufl() take as argument a qat_crypto_instance and a qat_crypto_request structure. These two structures are used only to get a reference to the adf_accel_dev and qat_crypto_request_buffs. In order to reuse these functions for the compression service, change the signature so that they take adf_accel_dev and qat_crypto_request_buffs. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs.c | 24 +++++---- drivers/crypto/qat/qat_common/qat_bl.c | 62 ++++++++++++------------ drivers/crypto/qat/qat_common/qat_bl.h | 8 +-- 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 3e7e9fffe28b3..dfa65e42db781 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -673,7 +673,7 @@ static void qat_aead_alg_callback(struct icp_qat_fw_la_resp *qat_resp, u8 stat_filed = qat_resp->comn_resp.comn_status; int res = 0, qat_res = ICP_QAT_FW_COMN_RESP_CRYPTO_STAT_GET(stat_filed); - qat_bl_free_bufl(inst, qat_req); + qat_bl_free_bufl(inst->accel_dev, &qat_req->buf); if (unlikely(qat_res != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) res = -EBADMSG; areq->base.complete(&areq->base, res); @@ -743,7 +743,7 @@ static void qat_skcipher_alg_callback(struct icp_qat_fw_la_resp *qat_resp, u8 stat_filed = qat_resp->comn_resp.comn_status; int res = 0, qat_res = ICP_QAT_FW_COMN_RESP_CRYPTO_STAT_GET(stat_filed); - qat_bl_free_bufl(inst, qat_req); + qat_bl_free_bufl(inst->accel_dev, &qat_req->buf); if (unlikely(qat_res != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) res = -EINVAL; @@ -799,7 +799,8 @@ static int qat_alg_aead_dec(struct aead_request *areq) if (cipher_len % AES_BLOCK_SIZE != 0) return -EINVAL; - ret = qat_bl_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, + &qat_req->buf, f); if (unlikely(ret)) return ret; @@ -821,7 +822,7 @@ static int qat_alg_aead_dec(struct aead_request *areq) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &areq->base); if (ret == -ENOSPC) - qat_bl_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst->accel_dev, &qat_req->buf); return ret; } @@ -842,7 +843,8 @@ static int qat_alg_aead_enc(struct aead_request *areq) if (areq->cryptlen % AES_BLOCK_SIZE != 0) return -EINVAL; - ret = qat_bl_sgl_to_bufl(ctx->inst, areq->src, areq->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, + &qat_req->buf, f); if (unlikely(ret)) return ret; @@ -866,7 +868,7 @@ static int qat_alg_aead_enc(struct aead_request *areq) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &areq->base); if (ret == -ENOSPC) - qat_bl_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst->accel_dev, &qat_req->buf); return ret; } @@ -1027,7 +1029,8 @@ static int qat_alg_skcipher_encrypt(struct skcipher_request *req) if (req->cryptlen == 0) return 0; - ret = qat_bl_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, req->src, req->dst, + &qat_req->buf, f); if (unlikely(ret)) return ret; @@ -1048,7 +1051,7 @@ static int qat_alg_skcipher_encrypt(struct skcipher_request *req) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &req->base); if (ret == -ENOSPC) - qat_bl_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst->accel_dev, &qat_req->buf); return ret; } @@ -1093,7 +1096,8 @@ static int qat_alg_skcipher_decrypt(struct skcipher_request *req) if (req->cryptlen == 0) return 0; - ret = qat_bl_sgl_to_bufl(ctx->inst, req->src, req->dst, qat_req, f); + ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, req->src, req->dst, + &qat_req->buf, f); if (unlikely(ret)) return ret; @@ -1115,7 +1119,7 @@ static int qat_alg_skcipher_decrypt(struct skcipher_request *req) ret = qat_alg_send_sym_message(qat_req, ctx->inst, &req->base); if (ret == -ENOSPC) - qat_bl_free_bufl(ctx->inst, qat_req); + qat_bl_free_bufl(ctx->inst->accel_dev, &qat_req->buf); return ret; } diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index 8f7743f3c89b9..5e319887f8d69 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -10,16 +10,16 @@ #include "qat_bl.h" #include "qat_crypto.h" -void qat_bl_free_bufl(struct qat_crypto_instance *inst, - struct qat_crypto_request *qat_req) +void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, + struct qat_crypto_request_buffs *buf) { - struct device *dev = &GET_DEV(inst->accel_dev); - struct qat_alg_buf_list *bl = qat_req->buf.bl; - struct qat_alg_buf_list *blout = qat_req->buf.blout; - dma_addr_t blp = qat_req->buf.blp; - dma_addr_t blpout = qat_req->buf.bloutp; - size_t sz = qat_req->buf.sz; - size_t sz_out = qat_req->buf.sz_out; + struct device *dev = &GET_DEV(accel_dev); + struct qat_alg_buf_list *bl = buf->bl; + struct qat_alg_buf_list *blout = buf->blout; + dma_addr_t blp = buf->blp; + dma_addr_t blpout = buf->bloutp; + size_t sz = buf->sz; + size_t sz_out = buf->sz_out; int bl_dma_dir; int i; @@ -31,7 +31,7 @@ void qat_bl_free_bufl(struct qat_crypto_instance *inst, dma_unmap_single(dev, blp, sz, DMA_TO_DEVICE); - if (!qat_req->buf.sgl_src_valid) + if (!buf->sgl_src_valid) kfree(bl); if (blp != blpout) { @@ -45,18 +45,18 @@ void qat_bl_free_bufl(struct qat_crypto_instance *inst, } dma_unmap_single(dev, blpout, sz_out, DMA_TO_DEVICE); - if (!qat_req->buf.sgl_dst_valid) + if (!buf->sgl_dst_valid) kfree(blout); } } -int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, +int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sgl, struct scatterlist *sglout, - struct qat_crypto_request *qat_req, + struct qat_crypto_request_buffs *buf, gfp_t flags) { - struct device *dev = &GET_DEV(inst->accel_dev); + struct device *dev = &GET_DEV(accel_dev); int i, sg_nctr = 0; int n = sg_nents(sgl); struct qat_alg_buf_list *bufl; @@ -65,23 +65,23 @@ int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, dma_addr_t bloutp = DMA_MAPPING_ERROR; struct scatterlist *sg; size_t sz_out, sz = struct_size(bufl, bufers, n); - int node = dev_to_node(&GET_DEV(inst->accel_dev)); + int node = dev_to_node(&GET_DEV(accel_dev)); int bufl_dma_dir; if (unlikely(!n)) return -EINVAL; - qat_req->buf.sgl_src_valid = false; - qat_req->buf.sgl_dst_valid = false; + buf->sgl_src_valid = false; + buf->sgl_dst_valid = false; if (n > QAT_MAX_BUFF_DESC) { bufl = kzalloc_node(sz, flags, node); if (unlikely(!bufl)) return -ENOMEM; } else { - bufl = &qat_req->buf.sgl_src.sgl_hdr; + bufl = &buf->sgl_src.sgl_hdr; memset(bufl, 0, sizeof(struct qat_alg_buf_list)); - qat_req->buf.sgl_src_valid = true; + buf->sgl_src_valid = true; } bufl_dma_dir = sgl != sglout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; @@ -107,9 +107,9 @@ int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, blp = dma_map_single(dev, bufl, sz, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(dev, blp))) goto err_in; - qat_req->buf.bl = bufl; - qat_req->buf.blp = blp; - qat_req->buf.sz = sz; + buf->bl = bufl; + buf->blp = blp; + buf->sz = sz; /* Handle out of place operation */ if (sgl != sglout) { struct qat_alg_buf *bufers; @@ -123,9 +123,9 @@ int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, if (unlikely(!buflout)) goto err_in; } else { - buflout = &qat_req->buf.sgl_dst.sgl_hdr; + buflout = &buf->sgl_dst.sgl_hdr; memset(buflout, 0, sizeof(struct qat_alg_buf_list)); - qat_req->buf.sgl_dst_valid = true; + buf->sgl_dst_valid = true; } bufers = buflout->bufers; @@ -151,13 +151,13 @@ int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, bloutp = dma_map_single(dev, buflout, sz_out, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(dev, bloutp))) goto err_out; - qat_req->buf.blout = buflout; - qat_req->buf.bloutp = bloutp; - qat_req->buf.sz_out = sz_out; + buf->blout = buflout; + buf->bloutp = bloutp; + buf->sz_out = sz_out; } else { /* Otherwise set the src and dst to the same address */ - qat_req->buf.bloutp = qat_req->buf.blp; - qat_req->buf.sz_out = 0; + buf->bloutp = buf->blp; + buf->sz_out = 0; } return 0; @@ -172,7 +172,7 @@ err_out: buflout->bufers[i].len, DMA_FROM_DEVICE); - if (!qat_req->buf.sgl_dst_valid) + if (!buf->sgl_dst_valid) kfree(buflout); err_in: @@ -186,7 +186,7 @@ err_in: bufl->bufers[i].len, bufl_dma_dir); - if (!qat_req->buf.sgl_src_valid) + if (!buf->sgl_src_valid) kfree(bufl); dev_err(dev, "Failed to map buf for dma\n"); diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index ed4c200ac6197..241299c219dd5 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -6,12 +6,12 @@ #include #include "qat_crypto.h" -void qat_bl_free_bufl(struct qat_crypto_instance *inst, - struct qat_crypto_request *qat_req); -int qat_bl_sgl_to_bufl(struct qat_crypto_instance *inst, +void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, + struct qat_crypto_request_buffs *buf); +int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sgl, struct scatterlist *sglout, - struct qat_crypto_request *qat_req, + struct qat_crypto_request_buffs *buf, gfp_t flags); #endif -- GitLab From 36ebc7472afeb58f1eb1d4c1f0546b9e98acea46 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:15 +0000 Subject: [PATCH 1251/1423] crypto: qat - generalize crypto request buffers The structure qat_crypto_request_buffs which contains the source and destination buffer lists and correspondent sizes and dma addresses is also required for the compression service. Rename it as qat_request_buffs and move it to qat_bl.h. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_bl.c | 4 +-- drivers/crypto/qat/qat_common/qat_bl.h | 38 ++++++++++++++++++++-- drivers/crypto/qat/qat_common/qat_crypto.h | 36 ++------------------ 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index 5e319887f8d69..c32b12d386f0a 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -11,7 +11,7 @@ #include "qat_crypto.h" void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, - struct qat_crypto_request_buffs *buf) + struct qat_request_buffs *buf) { struct device *dev = &GET_DEV(accel_dev); struct qat_alg_buf_list *bl = buf->bl; @@ -53,7 +53,7 @@ void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sgl, struct scatterlist *sglout, - struct qat_crypto_request_buffs *buf, + struct qat_request_buffs *buf, gfp_t flags) { struct device *dev = &GET_DEV(accel_dev); diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 241299c219dd5..1c534c57a36bc 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -4,14 +4,46 @@ #define QAT_BL_H #include #include -#include "qat_crypto.h" + +#define QAT_MAX_BUFF_DESC 4 + +struct qat_alg_buf { + u32 len; + u32 resrvd; + u64 addr; +} __packed; + +struct qat_alg_buf_list { + u64 resrvd; + u32 num_bufs; + u32 num_mapped_bufs; + struct qat_alg_buf bufers[]; +} __packed; + +struct qat_alg_fixed_buf_list { + struct qat_alg_buf_list sgl_hdr; + struct qat_alg_buf descriptors[QAT_MAX_BUFF_DESC]; +} __packed __aligned(64); + +struct qat_request_buffs { + struct qat_alg_buf_list *bl; + dma_addr_t blp; + struct qat_alg_buf_list *blout; + dma_addr_t bloutp; + size_t sz; + size_t sz_out; + bool sgl_src_valid; + bool sgl_dst_valid; + struct qat_alg_fixed_buf_list sgl_src; + struct qat_alg_fixed_buf_list sgl_dst; +}; void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, - struct qat_crypto_request_buffs *buf); + struct qat_request_buffs *buf); int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sgl, struct scatterlist *sglout, - struct qat_crypto_request_buffs *buf, + struct qat_request_buffs *buf, gfp_t flags); #endif diff --git a/drivers/crypto/qat/qat_common/qat_crypto.h b/drivers/crypto/qat/qat_common/qat_crypto.h index df3c738ce323a..bb116357a5684 100644 --- a/drivers/crypto/qat/qat_common/qat_crypto.h +++ b/drivers/crypto/qat/qat_common/qat_crypto.h @@ -8,6 +8,7 @@ #include #include "adf_accel_devices.h" #include "icp_qat_fw_la.h" +#include "qat_bl.h" struct qat_instance_backlog { struct list_head list; @@ -35,39 +36,6 @@ struct qat_crypto_instance { struct qat_instance_backlog backlog; }; -#define QAT_MAX_BUFF_DESC 4 - -struct qat_alg_buf { - u32 len; - u32 resrvd; - u64 addr; -} __packed; - -struct qat_alg_buf_list { - u64 resrvd; - u32 num_bufs; - u32 num_mapped_bufs; - struct qat_alg_buf bufers[]; -} __packed; - -struct qat_alg_fixed_buf_list { - struct qat_alg_buf_list sgl_hdr; - struct qat_alg_buf descriptors[QAT_MAX_BUFF_DESC]; -} __packed __aligned(64); - -struct qat_crypto_request_buffs { - struct qat_alg_buf_list *bl; - dma_addr_t blp; - struct qat_alg_buf_list *blout; - dma_addr_t bloutp; - size_t sz; - size_t sz_out; - bool sgl_src_valid; - bool sgl_dst_valid; - struct qat_alg_fixed_buf_list sgl_src; - struct qat_alg_fixed_buf_list sgl_dst; -}; - struct qat_crypto_request; struct qat_crypto_request { @@ -80,7 +48,7 @@ struct qat_crypto_request { struct aead_request *aead_req; struct skcipher_request *skcipher_req; }; - struct qat_crypto_request_buffs buf; + struct qat_request_buffs buf; void (*cb)(struct icp_qat_fw_la_resp *resp, struct qat_crypto_request *req); union { -- GitLab From cf692906bd61af2eec06a32a83d2a8ec3acf3548 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:16 +0000 Subject: [PATCH 1252/1423] crypto: qat - extend buffer list interface The compression service requires an additional pre-allocated buffer for each destination scatter list. Extend the function qat_alg_sgl_to_bufl() to take an additional structure that contains the dma address and the size of the extra buffer which will be appended in the destination FW SGL. The logic that unmaps buffers in qat_alg_free_bufl() has been changed to start unmapping from buffer 0 instead of skipping the initial buffers num_buff - num_mapped_bufs as that functionality was not used in the code. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs.c | 8 ++-- drivers/crypto/qat/qat_common/qat_bl.c | 58 ++++++++++++++++++------ drivers/crypto/qat/qat_common/qat_bl.h | 6 +++ 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index dfa65e42db781..b4b9f0aa59b98 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -800,7 +800,7 @@ static int qat_alg_aead_dec(struct aead_request *areq) return -EINVAL; ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, - &qat_req->buf, f); + &qat_req->buf, NULL, f); if (unlikely(ret)) return ret; @@ -844,7 +844,7 @@ static int qat_alg_aead_enc(struct aead_request *areq) return -EINVAL; ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, - &qat_req->buf, f); + &qat_req->buf, NULL, f); if (unlikely(ret)) return ret; @@ -1030,7 +1030,7 @@ static int qat_alg_skcipher_encrypt(struct skcipher_request *req) return 0; ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, req->src, req->dst, - &qat_req->buf, f); + &qat_req->buf, NULL, f); if (unlikely(ret)) return ret; @@ -1097,7 +1097,7 @@ static int qat_alg_skcipher_decrypt(struct skcipher_request *req) return 0; ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, req->src, req->dst, - &qat_req->buf, f); + &qat_req->buf, NULL, f); if (unlikely(ret)) return ret; diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index c32b12d386f0a..221a4eb610a38 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -35,10 +35,7 @@ void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, kfree(bl); if (blp != blpout) { - /* If out of place operation dma unmap only data */ - int bufless = blout->num_bufs - blout->num_mapped_bufs; - - for (i = bufless; i < blout->num_bufs; i++) { + for (i = 0; i < blout->num_mapped_bufs; i++) { dma_unmap_single(dev, blout->bufers[i].addr, blout->bufers[i].len, DMA_FROM_DEVICE); @@ -50,11 +47,13 @@ void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, } } -int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, - struct scatterlist *sgl, - struct scatterlist *sglout, - struct qat_request_buffs *buf, - gfp_t flags) +static int __qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_request_buffs *buf, + dma_addr_t extra_dst_buff, + size_t sz_extra_dst_buff, + gfp_t flags) { struct device *dev = &GET_DEV(accel_dev); int i, sg_nctr = 0; @@ -86,7 +85,7 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, bufl_dma_dir = sgl != sglout ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL; - for_each_sg(sgl, sg, n, i) + for (i = 0; i < n; i++) bufl->bufers[i].addr = DMA_MAPPING_ERROR; for_each_sg(sgl, sg, n, i) { @@ -113,8 +112,10 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, /* Handle out of place operation */ if (sgl != sglout) { struct qat_alg_buf *bufers; + int extra_buff = extra_dst_buff ? 1 : 0; + int n_sglout = sg_nents(sglout); - n = sg_nents(sglout); + n = n_sglout + extra_buff; sz_out = struct_size(buflout, bufers, n); sg_nctr = 0; @@ -129,10 +130,10 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, } bufers = buflout->bufers; - for_each_sg(sglout, sg, n, i) + for (i = 0; i < n; i++) bufers[i].addr = DMA_MAPPING_ERROR; - for_each_sg(sglout, sg, n, i) { + for_each_sg(sglout, sg, n_sglout, i) { int y = sg_nctr; if (!sg->length) @@ -146,7 +147,13 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, bufers[y].len = sg->length; sg_nctr++; } + if (extra_buff) { + bufers[sg_nctr].addr = extra_dst_buff; + bufers[sg_nctr].len = sz_extra_dst_buff; + } + buflout->num_bufs = sg_nctr; + buflout->num_bufs += extra_buff; buflout->num_mapped_bufs = sg_nctr; bloutp = dma_map_single(dev, buflout, sz_out, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(dev, bloutp))) @@ -166,11 +173,14 @@ err_out: dma_unmap_single(dev, bloutp, sz_out, DMA_TO_DEVICE); n = sg_nents(sglout); - for (i = 0; i < n; i++) + for (i = 0; i < n; i++) { + if (buflout->bufers[i].addr == extra_dst_buff) + break; if (!dma_mapping_error(dev, buflout->bufers[i].addr)) dma_unmap_single(dev, buflout->bufers[i].addr, buflout->bufers[i].len, DMA_FROM_DEVICE); + } if (!buf->sgl_dst_valid) kfree(buflout); @@ -192,3 +202,23 @@ err_in: dev_err(dev, "Failed to map buf for dma\n"); return -ENOMEM; } + +int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, + struct scatterlist *sgl, + struct scatterlist *sglout, + struct qat_request_buffs *buf, + struct qat_sgl_to_bufl_params *params, + gfp_t flags) +{ + dma_addr_t extra_dst_buff = 0; + size_t sz_extra_dst_buff = 0; + + if (params) { + extra_dst_buff = params->extra_dst_buff; + sz_extra_dst_buff = params->sz_extra_dst_buff; + } + + return __qat_bl_sgl_to_bufl(accel_dev, sgl, sglout, buf, + extra_dst_buff, sz_extra_dst_buff, + flags); +} diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 1c534c57a36bc..0c174fee9e645 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -38,12 +38,18 @@ struct qat_request_buffs { struct qat_alg_fixed_buf_list sgl_dst; }; +struct qat_sgl_to_bufl_params { + dma_addr_t extra_dst_buff; + size_t sz_extra_dst_buff; +}; + void qat_bl_free_bufl(struct adf_accel_dev *accel_dev, struct qat_request_buffs *buf); int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct scatterlist *sgl, struct scatterlist *sglout, struct qat_request_buffs *buf, + struct qat_sgl_to_bufl_params *params, gfp_t flags); #endif -- GitLab From 4d76f3880987a00da79f455876488ac3c7343e83 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:17 +0000 Subject: [PATCH 1253/1423] crypto: qat - relocate backlog related structures Move the structures qat_instance_backlog and qat_alg_req from qat_crypto.h to qat_algs_send.h since they are not unique to crypto. Both structures will be used by the compression service to support requests with the CRYPTO_TFM_REQ_MAY_BACKLOG flag set. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_algs_send.h | 16 +++++++++++++++- drivers/crypto/qat/qat_common/qat_crypto.h | 14 +------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_algs_send.h b/drivers/crypto/qat/qat_common/qat_algs_send.h index 5ce9f4f69d8ff..0baca16e1eff0 100644 --- a/drivers/crypto/qat/qat_common/qat_algs_send.h +++ b/drivers/crypto/qat/qat_common/qat_algs_send.h @@ -3,7 +3,21 @@ #ifndef QAT_ALGS_SEND_H #define QAT_ALGS_SEND_H -#include "qat_crypto.h" +#include +#include "adf_transport_internal.h" + +struct qat_instance_backlog { + struct list_head list; + spinlock_t lock; /* protects backlog list */ +}; + +struct qat_alg_req { + u32 *fw_req; + struct adf_etr_ring_data *tx_ring; + struct crypto_async_request *base; + struct list_head list; + struct qat_instance_backlog *backlog; +}; int qat_alg_send_message(struct qat_alg_req *req); void qat_alg_send_backlog(struct qat_instance_backlog *backlog); diff --git a/drivers/crypto/qat/qat_common/qat_crypto.h b/drivers/crypto/qat/qat_common/qat_crypto.h index bb116357a5684..505e881022a7d 100644 --- a/drivers/crypto/qat/qat_common/qat_crypto.h +++ b/drivers/crypto/qat/qat_common/qat_crypto.h @@ -8,21 +8,9 @@ #include #include "adf_accel_devices.h" #include "icp_qat_fw_la.h" +#include "qat_algs_send.h" #include "qat_bl.h" -struct qat_instance_backlog { - struct list_head list; - spinlock_t lock; /* protects backlog list */ -}; - -struct qat_alg_req { - u32 *fw_req; - struct adf_etr_ring_data *tx_ring; - struct crypto_async_request *base; - struct list_head list; - struct qat_instance_backlog *backlog; -}; - struct qat_crypto_instance { struct adf_etr_ring_data *sym_tx; struct adf_etr_ring_data *sym_rx; -- GitLab From 79d8dbf155d4e670b6ac20acbb6b22f02c728da5 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:18 +0000 Subject: [PATCH 1254/1423] crypto: qat - relocate qat_algs_alloc_flags() Move qat_algs_alloc_flags() from qat_crypto.h to qat_bl.h as this will be used also by the compression logic. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_bl.h | 6 ++++++ drivers/crypto/qat/qat_common/qat_crypto.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 0c174fee9e645..5f2ea8f352f76 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -2,6 +2,7 @@ /* Copyright(c) 2014 - 2022 Intel Corporation */ #ifndef QAT_BL_H #define QAT_BL_H +#include #include #include @@ -52,4 +53,9 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, struct qat_sgl_to_bufl_params *params, gfp_t flags); +static inline gfp_t qat_algs_alloc_flags(struct crypto_async_request *req) +{ + return req->flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; +} + #endif diff --git a/drivers/crypto/qat/qat_common/qat_crypto.h b/drivers/crypto/qat/qat_common/qat_crypto.h index 505e881022a7d..6a0e961bb9dc7 100644 --- a/drivers/crypto/qat/qat_common/qat_crypto.h +++ b/drivers/crypto/qat/qat_common/qat_crypto.h @@ -65,9 +65,4 @@ static inline bool adf_hw_dev_has_crypto(struct adf_accel_dev *accel_dev) return true; } -static inline gfp_t qat_algs_alloc_flags(struct crypto_async_request *req) -{ - return req->flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; -} - #endif -- GitLab From 93b2f5799cee57814a36882e61ef5f03d5dc5392 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:19 +0000 Subject: [PATCH 1255/1423] crypto: qat - rename and relocate GEN2 config function Rename qat_crypto_dev_config() in adf_gen2_dev_config() and relocate it to the newly created file adf_gen2_config.c. This function is specific to QAT GEN2 devices and will be used also to configure the compression service. In addition change the drivers to use the dev_config() in the hardware data structure (which for GEN2 devices now points to adf_gen2_dev_config()), for consistency. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- .../crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c | 2 + drivers/crypto/qat/qat_c3xxx/adf_drv.c | 2 +- .../qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c | 2 + .../crypto/qat/qat_c62x/adf_c62x_hw_data.c | 2 + drivers/crypto/qat/qat_c62x/adf_drv.c | 2 +- .../qat/qat_c62xvf/adf_c62xvf_hw_data.c | 2 + drivers/crypto/qat/qat_common/Makefile | 1 + .../crypto/qat/qat_common/adf_common_drv.h | 1 - .../crypto/qat/qat_common/adf_gen2_config.c | 131 ++++++++++++++++++ .../crypto/qat/qat_common/adf_gen2_config.h | 10 ++ drivers/crypto/qat/qat_common/qat_crypto.c | 120 +--------------- .../qat/qat_dh895xcc/adf_dh895xcc_hw_data.c | 2 + drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 2 +- .../qat_dh895xccvf/adf_dh895xccvf_hw_data.c | 2 + 14 files changed, 158 insertions(+), 123 deletions(-) create mode 100644 drivers/crypto/qat/qat_common/adf_gen2_config.c create mode 100644 drivers/crypto/qat/qat_common/adf_gen2_config.h diff --git a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c index 50d5afa26a9be..c0519a79060a9 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2014 - 2021 Intel Corporation */ #include #include +#include #include #include #include "adf_c3xxx_hw_data.h" @@ -124,6 +125,7 @@ void adf_init_hw_data_c3xxx(struct adf_hw_device_data *hw_data) hw_data->reset_device = adf_reset_flr; hw_data->set_ssm_wdtimer = adf_gen2_set_ssm_wdtimer; hw_data->disable_iov = adf_disable_sriov; + hw_data->dev_config = adf_gen2_dev_config; adf_gen2_init_pf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); diff --git a/drivers/crypto/qat/qat_c3xxx/adf_drv.c b/drivers/crypto/qat/qat_c3xxx/adf_drv.c index 2aef0bb791dfa..1f4fbf4562b22 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_drv.c @@ -201,7 +201,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto out_err_disable_aer; } - ret = qat_crypto_dev_config(accel_dev); + ret = hw_data->dev_config(accel_dev); if (ret) goto out_err_disable_aer; diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c index a9fbe57b32ae3..6c37dda6da2ee 100644 --- a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c +++ b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2015 - 2021 Intel Corporation */ #include #include +#include #include #include #include @@ -86,6 +87,7 @@ void adf_init_hw_data_c3xxxiov(struct adf_hw_device_data *hw_data) hw_data->get_sku = get_sku; hw_data->enable_ints = adf_vf_void_noop; hw_data->dev_class->instances++; + hw_data->dev_config = adf_gen2_dev_config; adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); diff --git a/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c b/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c index c00386fe6587a..689358cb7bb08 100644 --- a/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c +++ b/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2014 - 2021 Intel Corporation */ #include #include +#include #include #include #include "adf_c62x_hw_data.h" @@ -126,6 +127,7 @@ void adf_init_hw_data_c62x(struct adf_hw_device_data *hw_data) hw_data->reset_device = adf_reset_flr; hw_data->set_ssm_wdtimer = adf_gen2_set_ssm_wdtimer; hw_data->disable_iov = adf_disable_sriov; + hw_data->dev_config = adf_gen2_dev_config; adf_gen2_init_pf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); diff --git a/drivers/crypto/qat/qat_c62x/adf_drv.c b/drivers/crypto/qat/qat_c62x/adf_drv.c index 56163083f1616..4ccaf298250c3 100644 --- a/drivers/crypto/qat/qat_c62x/adf_drv.c +++ b/drivers/crypto/qat/qat_c62x/adf_drv.c @@ -201,7 +201,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto out_err_disable_aer; } - ret = qat_crypto_dev_config(accel_dev); + ret = hw_data->dev_config(accel_dev); if (ret) goto out_err_disable_aer; diff --git a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c index 0282038fca548..521110ecd07fd 100644 --- a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c +++ b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2015 - 2021 Intel Corporation */ #include #include +#include #include #include #include @@ -86,6 +87,7 @@ void adf_init_hw_data_c62xiov(struct adf_hw_device_data *hw_data) hw_data->get_sku = get_sku; hw_data->enable_ints = adf_vf_void_noop; hw_data->dev_class->instances++; + hw_data->dev_config = adf_gen2_dev_config; adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); diff --git a/drivers/crypto/qat/qat_common/Makefile b/drivers/crypto/qat/qat_common/Makefile index b0587d03eac29..b59b6315134b6 100644 --- a/drivers/crypto/qat/qat_common/Makefile +++ b/drivers/crypto/qat/qat_common/Makefile @@ -12,6 +12,7 @@ intel_qat-objs := adf_cfg.o \ adf_hw_arbiter.o \ adf_sysfs.o \ adf_gen2_hw_data.o \ + adf_gen2_config.o \ adf_gen4_hw_data.o \ adf_gen4_pm.o \ qat_crypto.o \ diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h index 7bb477c3ce25f..b8ec0268d2d29 100644 --- a/drivers/crypto/qat/qat_common/adf_common_drv.h +++ b/drivers/crypto/qat/qat_common/adf_common_drv.h @@ -110,7 +110,6 @@ int adf_init_etr_data(struct adf_accel_dev *accel_dev); void adf_cleanup_etr_data(struct adf_accel_dev *accel_dev); int qat_crypto_register(void); int qat_crypto_unregister(void); -int qat_crypto_dev_config(struct adf_accel_dev *accel_dev); int qat_crypto_vf_dev_config(struct adf_accel_dev *accel_dev); struct qat_crypto_instance *qat_crypto_get_instance_node(int node); void qat_crypto_put_instance(struct qat_crypto_instance *inst); diff --git a/drivers/crypto/qat/qat_common/adf_gen2_config.c b/drivers/crypto/qat/qat_common/adf_gen2_config.c new file mode 100644 index 0000000000000..1c490e1859a76 --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen2_config.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation */ +#include "adf_accel_devices.h" +#include "adf_cfg.h" +#include "adf_cfg_strings.h" +#include "adf_gen2_config.h" +#include "adf_common_drv.h" +#include "qat_crypto.h" +#include "adf_transport_access_macros.h" + +static int adf_gen2_crypto_dev_config(struct adf_accel_dev *accel_dev) +{ + char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; + int banks = GET_MAX_BANKS(accel_dev); + int cpus = num_online_cpus(); + unsigned long val; + int instances; + int ret; + int i; + + if (adf_hw_dev_has_crypto(accel_dev)) + instances = min(cpus, banks); + else + instances = 0; + + ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); + if (ret) + goto err; + + ret = adf_cfg_section_add(accel_dev, "Accelerator0"); + if (ret) + goto err; + + for (i = 0; i < instances; i++) { + val = i; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_BANK_NUM, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_BANK_NUM, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + snprintf(key, sizeof(key), ADF_CY "%d" ADF_ETRMGR_CORE_AFFINITY, + i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_SIZE, i); + val = 128; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 512; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_SIZE, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 0; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_TX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 2; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_TX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 8; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_RX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 10; + snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_RX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = ADF_COALESCING_DEF_TIME; + snprintf(key, sizeof(key), ADF_ETRMGR_COALESCE_TIMER_FORMAT, i); + ret = adf_cfg_add_key_value_param(accel_dev, "Accelerator0", + key, &val, ADF_DEC); + if (ret) + goto err; + } + + val = i; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_CY, + &val, ADF_DEC); + if (ret) + goto err; + + set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); + return 0; +err: + dev_err(&GET_DEV(accel_dev), "Failed to start QAT accel dev\n"); + return ret; +} + +/** + * adf_gen2_dev_config() - create dev config required to create instances + * + * @accel_dev: Pointer to acceleration device. + * + * Function creates device configuration required to create instances + * + * Return: 0 on success, error code otherwise. + */ +int adf_gen2_dev_config(struct adf_accel_dev *accel_dev) +{ + return adf_gen2_crypto_dev_config(accel_dev); +} +EXPORT_SYMBOL_GPL(adf_gen2_dev_config); diff --git a/drivers/crypto/qat/qat_common/adf_gen2_config.h b/drivers/crypto/qat/qat_common/adf_gen2_config.h new file mode 100644 index 0000000000000..4bf9da2de68aa --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen2_config.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef ADF_GEN2_CONFIG_H_ +#define ADF_GEN2_CONFIG_H_ + +#include "adf_accel_devices.h" + +int adf_gen2_dev_config(struct adf_accel_dev *accel_dev); + +#endif diff --git a/drivers/crypto/qat/qat_common/qat_crypto.c b/drivers/crypto/qat/qat_common/qat_crypto.c index 9341d892533a7..e31199eade5b6 100644 --- a/drivers/crypto/qat/qat_common/qat_crypto.c +++ b/drivers/crypto/qat/qat_common/qat_crypto.c @@ -5,7 +5,6 @@ #include "adf_accel_devices.h" #include "adf_common_drv.h" #include "adf_transport.h" -#include "adf_transport_access_macros.h" #include "adf_cfg.h" #include "adf_cfg_strings.h" #include "adf_gen2_hw_data.h" @@ -126,126 +125,9 @@ int qat_crypto_vf_dev_config(struct adf_accel_dev *accel_dev) return -EFAULT; } - return qat_crypto_dev_config(accel_dev); + return GET_HW_DATA(accel_dev)->dev_config(accel_dev); } -/** - * qat_crypto_dev_config() - create dev config required to create crypto inst. - * - * @accel_dev: Pointer to acceleration device. - * - * Function creates device configuration required to create crypto instances - * - * Return: 0 on success, error code otherwise. - */ -int qat_crypto_dev_config(struct adf_accel_dev *accel_dev) -{ - char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; - int banks = GET_MAX_BANKS(accel_dev); - int cpus = num_online_cpus(); - unsigned long val; - int instances; - int ret; - int i; - - if (adf_hw_dev_has_crypto(accel_dev)) - instances = min(cpus, banks); - else - instances = 0; - - ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); - if (ret) - goto err; - - ret = adf_cfg_section_add(accel_dev, "Accelerator0"); - if (ret) - goto err; - - for (i = 0; i < instances; i++) { - val = i; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_BANK_NUM, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_BANK_NUM, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - snprintf(key, sizeof(key), ADF_CY "%d" ADF_ETRMGR_CORE_AFFINITY, - i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_SIZE, i); - val = 128; - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = 512; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_SIZE, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = 0; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_TX, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = 2; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_TX, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = 8; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_RX, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = 10; - snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_SYM_RX, i); - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, - key, &val, ADF_DEC); - if (ret) - goto err; - - val = ADF_COALESCING_DEF_TIME; - snprintf(key, sizeof(key), ADF_ETRMGR_COALESCE_TIMER_FORMAT, i); - ret = adf_cfg_add_key_value_param(accel_dev, "Accelerator0", - key, &val, ADF_DEC); - if (ret) - goto err; - } - - val = i; - ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_CY, - &val, ADF_DEC); - if (ret) - goto err; - - set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); - return 0; -err: - dev_err(&GET_DEV(accel_dev), "Failed to start QAT accel dev\n"); - return ret; -} -EXPORT_SYMBOL_GPL(qat_crypto_dev_config); - static int qat_crypto_create_instances(struct adf_accel_dev *accel_dev) { unsigned long num_inst, num_msg_sym, num_msg_asym; diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c b/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c index cb3bdd3618fb0..baacf817abf68 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2014 - 2021 Intel Corporation */ #include #include +#include #include #include #include "adf_dh895xcc_hw_data.h" @@ -234,6 +235,7 @@ void adf_init_hw_data_dh895xcc(struct adf_hw_device_data *hw_data) hw_data->enable_ints = adf_gen2_enable_ints; hw_data->reset_device = adf_reset_sbr; hw_data->disable_iov = adf_disable_sriov; + hw_data->dev_config = adf_gen2_dev_config; adf_gen2_init_pf_pfvf_ops(&hw_data->pfvf_ops); hw_data->pfvf_ops.enable_vf2pf_interrupts = enable_vf2pf_interrupts; diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c index acca56752aa02..ebeb17b67fcd5 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c @@ -201,7 +201,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto out_err_disable_aer; } - ret = qat_crypto_dev_config(accel_dev); + ret = hw_data->dev_config(accel_dev); if (ret) goto out_err_disable_aer; diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c index 31c14d7e1c115..b933a00fb91b8 100644 --- a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c +++ b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c @@ -2,6 +2,7 @@ /* Copyright(c) 2015 - 2021 Intel Corporation */ #include #include +#include #include #include #include @@ -86,6 +87,7 @@ void adf_init_hw_data_dh895xcciov(struct adf_hw_device_data *hw_data) hw_data->get_sku = get_sku; hw_data->enable_ints = adf_vf_void_noop; hw_data->dev_class->instances++; + hw_data->dev_config = adf_gen2_dev_config; adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); -- GitLab From 1198ae56c9a520384dcf53f01cd9adecd73751d0 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:20 +0000 Subject: [PATCH 1256/1423] crypto: qat - expose deflate through acomp api for QAT GEN2 Add infrastructure for implementing the acomp APIs in the QAT driver and expose the deflate algorithm for QAT GEN2 devices. This adds (1) the compression service which includes logic to create, allocate and handle compression instances; (2) logic to create configuration entries at probe time for the compression instances; (3) updates to the firmware API for allowing the compression service; and; (4) a back-end for deflate that implements the acomp api for QAT GEN2 devices. The implementation configures the device to produce data compressed statically, optimized for throughput over compression ratio. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_4xxx/adf_drv.c | 6 + .../crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c | 2 + .../qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c | 2 + .../crypto/qat/qat_c62x/adf_c62x_hw_data.c | 2 + .../qat/qat_c62xvf/adf_c62xvf_hw_data.c | 2 + drivers/crypto/qat/qat_common/Makefile | 3 + .../crypto/qat/qat_common/adf_accel_devices.h | 14 + .../crypto/qat/qat_common/adf_cfg_strings.h | 1 + .../crypto/qat/qat_common/adf_common_drv.h | 8 + drivers/crypto/qat/qat_common/adf_ctl_drv.c | 6 + .../crypto/qat/qat_common/adf_gen2_config.c | 99 ++++- drivers/crypto/qat/qat_common/adf_gen2_dc.c | 70 +++ drivers/crypto/qat/qat_common/adf_gen2_dc.h | 10 + drivers/crypto/qat/qat_common/adf_init.c | 11 + drivers/crypto/qat/qat_common/adf_sriov.c | 4 + drivers/crypto/qat/qat_common/icp_qat_fw.h | 24 ++ .../crypto/qat/qat_common/icp_qat_fw_comp.h | 404 ++++++++++++++++++ drivers/crypto/qat/qat_common/icp_qat_hw.h | 66 +++ drivers/crypto/qat/qat_common/qat_comp_algs.c | 274 ++++++++++++ drivers/crypto/qat/qat_common/qat_comp_req.h | 113 +++++ .../crypto/qat/qat_common/qat_compression.c | 297 +++++++++++++ .../crypto/qat/qat_common/qat_compression.h | 37 ++ .../qat/qat_dh895xcc/adf_dh895xcc_hw_data.c | 2 + .../qat_dh895xccvf/adf_dh895xccvf_hw_data.c | 2 + 24 files changed, 1447 insertions(+), 12 deletions(-) create mode 100644 drivers/crypto/qat/qat_common/adf_gen2_dc.c create mode 100644 drivers/crypto/qat/qat_common/adf_gen2_dc.h create mode 100644 drivers/crypto/qat/qat_common/icp_qat_fw_comp.h create mode 100644 drivers/crypto/qat/qat_common/qat_comp_algs.c create mode 100644 drivers/crypto/qat/qat_common/qat_comp_req.h create mode 100644 drivers/crypto/qat/qat_common/qat_compression.c create mode 100644 drivers/crypto/qat/qat_common/qat_compression.h diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c b/drivers/crypto/qat/qat_4xxx/adf_drv.c index 670a58b25cb16..8496c451b48e1 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c @@ -155,6 +155,12 @@ int adf_crypto_dev_config(struct adf_accel_dev *accel_dev) if (ret) goto err; + val = 0; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_DC, + &val, ADF_DEC); + if (ret) + goto err; + set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); return 0; err: diff --git a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c index c0519a79060a9..c55c51a07677d 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "adf_c3xxx_hw_data.h" @@ -129,6 +130,7 @@ void adf_init_hw_data_c3xxx(struct adf_hw_device_data *hw_data) adf_gen2_init_pf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_c3xxx(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c index 6c37dda6da2ee..84d9486e04de6 100644 --- a/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c +++ b/drivers/crypto/qat/qat_c3xxxvf/adf_c3xxxvf_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,7 @@ void adf_init_hw_data_c3xxxiov(struct adf_hw_device_data *hw_data) adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_c3xxxiov(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c b/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c index 689358cb7bb08..b7aa19d2fa804 100644 --- a/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c +++ b/drivers/crypto/qat/qat_c62x/adf_c62x_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "adf_c62x_hw_data.h" @@ -131,6 +132,7 @@ void adf_init_hw_data_c62x(struct adf_hw_device_data *hw_data) adf_gen2_init_pf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_c62x(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c index 521110ecd07fd..751d7aa57fc7f 100644 --- a/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c +++ b/drivers/crypto/qat/qat_c62xvf/adf_c62xvf_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,7 @@ void adf_init_hw_data_c62xiov(struct adf_hw_device_data *hw_data) adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_c62xiov(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_common/Makefile b/drivers/crypto/qat/qat_common/Makefile index b59b6315134b6..e3db4786738f7 100644 --- a/drivers/crypto/qat/qat_common/Makefile +++ b/drivers/crypto/qat/qat_common/Makefile @@ -15,7 +15,10 @@ intel_qat-objs := adf_cfg.o \ adf_gen2_config.o \ adf_gen4_hw_data.o \ adf_gen4_pm.o \ + adf_gen2_dc.o \ qat_crypto.o \ + qat_compression.o \ + qat_comp_algs.o \ qat_algs.o \ qat_asym_algs.o \ qat_algs_send.o \ diff --git a/drivers/crypto/qat/qat_common/adf_accel_devices.h b/drivers/crypto/qat/qat_common/adf_accel_devices.h index 0a55a4f34dcfd..284f5aad3ee0b 100644 --- a/drivers/crypto/qat/qat_common/adf_accel_devices.h +++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h @@ -163,6 +163,10 @@ struct adf_pfvf_ops { u32 pfvf_offset, u8 compat_ver); }; +struct adf_dc_ops { + void (*build_deflate_ctx)(void *ctx); +}; + struct adf_hw_device_data { struct adf_hw_device_class *dev_class; u32 (*get_accel_mask)(struct adf_hw_device_data *self); @@ -202,6 +206,7 @@ struct adf_hw_device_data { int (*dev_config)(struct adf_accel_dev *accel_dev); struct adf_pfvf_ops pfvf_ops; struct adf_hw_csr_ops csr_ops; + struct adf_dc_ops dc_ops; const char *fw_name; const char *fw_mmp_name; u32 fuses; @@ -247,6 +252,7 @@ struct adf_hw_device_data { #define GET_MAX_ACCELENGINES(accel_dev) (GET_HW_DATA(accel_dev)->num_engines) #define GET_CSR_OPS(accel_dev) (&(accel_dev)->hw_device->csr_ops) #define GET_PFVF_OPS(accel_dev) (&(accel_dev)->hw_device->pfvf_ops) +#define GET_DC_OPS(accel_dev) (&(accel_dev)->hw_device->dc_ops) #define accel_to_pci_dev(accel_ptr) accel_ptr->accel_pci_dev.pci_dev struct adf_admin_comms; @@ -266,13 +272,21 @@ struct adf_accel_vf_info { u8 vf_compat_ver; }; +struct adf_dc_data { + u8 *ovf_buff; + size_t ovf_buff_sz; + dma_addr_t ovf_buff_p; +}; + struct adf_accel_dev { struct adf_etr_data *transport; struct adf_hw_device_data *hw_device; struct adf_cfg_device_data *cfg; struct adf_fw_loader_data *fw_loader; struct adf_admin_comms *admin; + struct adf_dc_data *dc_data; struct list_head crypto_list; + struct list_head compression_list; unsigned long status; atomic_t ref_count; struct dentry *debugfs_dir; diff --git a/drivers/crypto/qat/qat_common/adf_cfg_strings.h b/drivers/crypto/qat/qat_common/adf_cfg_strings.h index 655248dbf962d..5d8c3bdb258c1 100644 --- a/drivers/crypto/qat/qat_common/adf_cfg_strings.h +++ b/drivers/crypto/qat/qat_common/adf_cfg_strings.h @@ -20,6 +20,7 @@ #define ADF_ETRMGR_BANK "Bank" #define ADF_RING_SYM_BANK_NUM "BankSymNumber" #define ADF_RING_ASYM_BANK_NUM "BankAsymNumber" +#define ADF_RING_DC_BANK_NUM "BankDcNumber" #define ADF_CY "Cy" #define ADF_DC "Dc" #define ADF_CFG_DC "dc" diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h index b8ec0268d2d29..7189265573c08 100644 --- a/drivers/crypto/qat/qat_common/adf_common_drv.h +++ b/drivers/crypto/qat/qat_common/adf_common_drv.h @@ -120,6 +120,14 @@ void qat_algs_unregister(void); int qat_asym_algs_register(void); void qat_asym_algs_unregister(void); +struct qat_compression_instance *qat_compression_get_instance_node(int node); +void qat_compression_put_instance(struct qat_compression_instance *inst); +int qat_compression_register(void); +int qat_compression_unregister(void); +int qat_comp_algs_register(void); +void qat_comp_algs_unregister(void); +void qat_comp_alg_callback(void *resp); + int adf_isr_resource_alloc(struct adf_accel_dev *accel_dev); void adf_isr_resource_free(struct adf_accel_dev *accel_dev); int adf_vf_isr_resource_alloc(struct adf_accel_dev *accel_dev); diff --git a/drivers/crypto/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/qat/qat_common/adf_ctl_drv.c index 82b69e1f725ba..9190532b27eb5 100644 --- a/drivers/crypto/qat/qat_common/adf_ctl_drv.c +++ b/drivers/crypto/qat/qat_common/adf_ctl_drv.c @@ -438,8 +438,13 @@ static int __init adf_register_ctl_device_driver(void) if (qat_crypto_register()) goto err_crypto_register; + if (qat_compression_register()) + goto err_compression_register; + return 0; +err_compression_register: + qat_crypto_unregister(); err_crypto_register: adf_exit_vf_wq(); err_vf_wq: @@ -463,6 +468,7 @@ static void __exit adf_unregister_ctl_device_driver(void) adf_exit_vf_wq(); adf_exit_pf_wq(); qat_crypto_unregister(); + qat_compression_unregister(); adf_clean_vf_map(false); mutex_destroy(&adf_ctl_lock); } diff --git a/drivers/crypto/qat/qat_common/adf_gen2_config.c b/drivers/crypto/qat/qat_common/adf_gen2_config.c index 1c490e1859a76..eeb30da7587a5 100644 --- a/drivers/crypto/qat/qat_common/adf_gen2_config.c +++ b/drivers/crypto/qat/qat_common/adf_gen2_config.c @@ -6,6 +6,7 @@ #include "adf_gen2_config.h" #include "adf_common_drv.h" #include "qat_crypto.h" +#include "qat_compression.h" #include "adf_transport_access_macros.h" static int adf_gen2_crypto_dev_config(struct adf_accel_dev *accel_dev) @@ -23,14 +24,6 @@ static int adf_gen2_crypto_dev_config(struct adf_accel_dev *accel_dev) else instances = 0; - ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); - if (ret) - goto err; - - ret = adf_cfg_section_add(accel_dev, "Accelerator0"); - if (ret) - goto err; - for (i = 0; i < instances; i++) { val = i; snprintf(key, sizeof(key), ADF_CY "%d" ADF_RING_ASYM_BANK_NUM, i); @@ -108,10 +101,68 @@ static int adf_gen2_crypto_dev_config(struct adf_accel_dev *accel_dev) if (ret) goto err; - set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); - return 0; + return ret; + +err: + dev_err(&GET_DEV(accel_dev), "Failed to add configuration for crypto\n"); + return ret; +} + +static int adf_gen2_comp_dev_config(struct adf_accel_dev *accel_dev) +{ + char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; + int banks = GET_MAX_BANKS(accel_dev); + int cpus = num_online_cpus(); + unsigned long val; + int instances; + int ret; + int i; + + if (adf_hw_dev_has_compression(accel_dev)) + instances = min(cpus, banks); + else + instances = 0; + + for (i = 0; i < instances; i++) { + val = i; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_BANK_NUM, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 512; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_SIZE, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 6; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_TX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 14; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_RX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + } + + val = i; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_DC, + &val, ADF_DEC); + if (ret) + return ret; + + return ret; + err: - dev_err(&GET_DEV(accel_dev), "Failed to start QAT accel dev\n"); + dev_err(&GET_DEV(accel_dev), "Failed to add configuration for compression\n"); return ret; } @@ -126,6 +177,30 @@ err: */ int adf_gen2_dev_config(struct adf_accel_dev *accel_dev) { - return adf_gen2_crypto_dev_config(accel_dev); + int ret; + + ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); + if (ret) + goto err; + + ret = adf_cfg_section_add(accel_dev, "Accelerator0"); + if (ret) + goto err; + + ret = adf_gen2_crypto_dev_config(accel_dev); + if (ret) + goto err; + + ret = adf_gen2_comp_dev_config(accel_dev); + if (ret) + goto err; + + set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); + + return ret; + +err: + dev_err(&GET_DEV(accel_dev), "Failed to configure QAT driver\n"); + return ret; } EXPORT_SYMBOL_GPL(adf_gen2_dev_config); diff --git a/drivers/crypto/qat/qat_common/adf_gen2_dc.c b/drivers/crypto/qat/qat_common/adf_gen2_dc.c new file mode 100644 index 0000000000000..47261b1c1da69 --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen2_dc.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation */ +#include "adf_accel_devices.h" +#include "adf_gen2_dc.h" +#include "icp_qat_fw_comp.h" + +static void qat_comp_build_deflate_ctx(void *ctx) +{ + struct icp_qat_fw_comp_req *req_tmpl = (struct icp_qat_fw_comp_req *)ctx; + struct icp_qat_fw_comn_req_hdr *header = &req_tmpl->comn_hdr; + struct icp_qat_fw_comp_req_hdr_cd_pars *cd_pars = &req_tmpl->cd_pars; + struct icp_qat_fw_comp_req_params *req_pars = &req_tmpl->comp_pars; + struct icp_qat_fw_comp_cd_hdr *comp_cd_ctrl = &req_tmpl->comp_cd_ctrl; + + memset(req_tmpl, 0, sizeof(*req_tmpl)); + header->hdr_flags = + ICP_QAT_FW_COMN_HDR_FLAGS_BUILD(ICP_QAT_FW_COMN_REQ_FLAG_SET); + header->service_type = ICP_QAT_FW_COMN_REQ_CPM_FW_COMP; + header->service_cmd_id = ICP_QAT_FW_COMP_CMD_STATIC; + header->comn_req_flags = + ICP_QAT_FW_COMN_FLAGS_BUILD(QAT_COMN_CD_FLD_TYPE_16BYTE_DATA, + QAT_COMN_PTR_TYPE_SGL); + header->serv_specif_flags = + ICP_QAT_FW_COMP_FLAGS_BUILD(ICP_QAT_FW_COMP_STATELESS_SESSION, + ICP_QAT_FW_COMP_NOT_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_NOT_ENH_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_NOT_DISABLE_TYPE0_ENH_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_ENABLE_SECURE_RAM_USED_AS_INTMD_BUF); + cd_pars->u.sl.comp_slice_cfg_word[0] = + ICP_QAT_HW_COMPRESSION_CONFIG_BUILD(ICP_QAT_HW_COMPRESSION_DIR_COMPRESS, + ICP_QAT_HW_COMPRESSION_DELAYED_MATCH_DISABLED, + ICP_QAT_HW_COMPRESSION_ALGO_DEFLATE, + ICP_QAT_HW_COMPRESSION_DEPTH_1, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_0); + req_pars->crc.legacy.initial_adler = COMP_CPR_INITIAL_ADLER; + req_pars->crc.legacy.initial_crc32 = COMP_CPR_INITIAL_CRC; + req_pars->req_par_flags = + ICP_QAT_FW_COMP_REQ_PARAM_FLAGS_BUILD(ICP_QAT_FW_COMP_SOP, + ICP_QAT_FW_COMP_EOP, + ICP_QAT_FW_COMP_BFINAL, + ICP_QAT_FW_COMP_CNV, + ICP_QAT_FW_COMP_CNV_RECOVERY, + ICP_QAT_FW_COMP_NO_CNV_DFX, + ICP_QAT_FW_COMP_CRC_MODE_LEGACY, + ICP_QAT_FW_COMP_NO_XXHASH_ACC, + ICP_QAT_FW_COMP_CNV_ERROR_NONE, + ICP_QAT_FW_COMP_NO_APPEND_CRC, + ICP_QAT_FW_COMP_NO_DROP_DATA); + ICP_QAT_FW_COMN_NEXT_ID_SET(comp_cd_ctrl, ICP_QAT_FW_SLICE_DRAM_WR); + ICP_QAT_FW_COMN_CURR_ID_SET(comp_cd_ctrl, ICP_QAT_FW_SLICE_COMP); + + /* Fill second half of the template for decompression */ + memcpy(req_tmpl + 1, req_tmpl, sizeof(*req_tmpl)); + req_tmpl++; + header = &req_tmpl->comn_hdr; + header->service_cmd_id = ICP_QAT_FW_COMP_CMD_DECOMPRESS; + cd_pars = &req_tmpl->cd_pars; + cd_pars->u.sl.comp_slice_cfg_word[0] = + ICP_QAT_HW_COMPRESSION_CONFIG_BUILD(ICP_QAT_HW_COMPRESSION_DIR_DECOMPRESS, + ICP_QAT_HW_COMPRESSION_DELAYED_MATCH_DISABLED, + ICP_QAT_HW_COMPRESSION_ALGO_DEFLATE, + ICP_QAT_HW_COMPRESSION_DEPTH_1, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_0); +} + +void adf_gen2_init_dc_ops(struct adf_dc_ops *dc_ops) +{ + dc_ops->build_deflate_ctx = qat_comp_build_deflate_ctx; +} +EXPORT_SYMBOL_GPL(adf_gen2_init_dc_ops); diff --git a/drivers/crypto/qat/qat_common/adf_gen2_dc.h b/drivers/crypto/qat/qat_common/adf_gen2_dc.h new file mode 100644 index 0000000000000..6eae023354d76 --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen2_dc.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef ADF_GEN2_DC_H +#define ADF_GEN2_DC_H + +#include "adf_accel_devices.h" + +void adf_gen2_init_dc_ops(struct adf_dc_ops *dc_ops); + +#endif /* ADF_GEN2_DC_H */ diff --git a/drivers/crypto/qat/qat_common/adf_init.c b/drivers/crypto/qat/qat_common/adf_init.c index 33a9a46d69494..cef7bb8ec0073 100644 --- a/drivers/crypto/qat/qat_common/adf_init.c +++ b/drivers/crypto/qat/qat_common/adf_init.c @@ -209,6 +209,14 @@ int adf_dev_start(struct adf_accel_dev *accel_dev) clear_bit(ADF_STATUS_STARTED, &accel_dev->status); return -EFAULT; } + + if (!list_empty(&accel_dev->compression_list) && qat_comp_algs_register()) { + dev_err(&GET_DEV(accel_dev), + "Failed to register compression algs\n"); + set_bit(ADF_STATUS_STARTING, &accel_dev->status); + clear_bit(ADF_STATUS_STARTED, &accel_dev->status); + return -EFAULT; + } return 0; } EXPORT_SYMBOL_GPL(adf_dev_start); @@ -242,6 +250,9 @@ void adf_dev_stop(struct adf_accel_dev *accel_dev) qat_asym_algs_unregister(); } + if (!list_empty(&accel_dev->compression_list)) + qat_comp_algs_unregister(); + list_for_each(list_itr, &service_table) { service = list_entry(list_itr, struct service_hndl, list); if (!test_bit(accel_dev->accel_id, service->start_status)) diff --git a/drivers/crypto/qat/qat_common/adf_sriov.c b/drivers/crypto/qat/qat_common/adf_sriov.c index b2db1d70d71fb..d85a90cc387b4 100644 --- a/drivers/crypto/qat/qat_common/adf_sriov.c +++ b/drivers/crypto/qat/qat_common/adf_sriov.c @@ -170,6 +170,10 @@ int adf_sriov_configure(struct pci_dev *pdev, int numvfs) if (adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_CY, (void *)&val, ADF_DEC)) return -EFAULT; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_DC, + &val, ADF_DEC); + if (ret) + return ret; set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); diff --git a/drivers/crypto/qat/qat_common/icp_qat_fw.h b/drivers/crypto/qat/qat_common/icp_qat_fw.h index 6dc09d270082f..c141160421e19 100644 --- a/drivers/crypto/qat/qat_common/icp_qat_fw.h +++ b/drivers/crypto/qat/qat_common/icp_qat_fw.h @@ -116,6 +116,10 @@ struct icp_qat_fw_comn_resp { #define ICP_QAT_FW_COMN_VALID_FLAG_BITPOS 7 #define ICP_QAT_FW_COMN_VALID_FLAG_MASK 0x1 #define ICP_QAT_FW_COMN_HDR_RESRVD_FLD_MASK 0x7F +#define ICP_QAT_FW_COMN_CNV_FLAG_BITPOS 6 +#define ICP_QAT_FW_COMN_CNV_FLAG_MASK 0x1 +#define ICP_QAT_FW_COMN_CNVNR_FLAG_BITPOS 5 +#define ICP_QAT_FW_COMN_CNVNR_FLAG_MASK 0x1 #define ICP_QAT_FW_COMN_OV_SRV_TYPE_GET(icp_qat_fw_comn_req_hdr_t) \ icp_qat_fw_comn_req_hdr_t.service_type @@ -132,6 +136,26 @@ struct icp_qat_fw_comn_resp { #define ICP_QAT_FW_COMN_HDR_VALID_FLAG_GET(hdr_t) \ ICP_QAT_FW_COMN_VALID_FLAG_GET(hdr_t.hdr_flags) +#define ICP_QAT_FW_COMN_HDR_CNVNR_FLAG_GET(hdr_flags) \ + QAT_FIELD_GET(hdr_flags, \ + ICP_QAT_FW_COMN_CNVNR_FLAG_BITPOS, \ + ICP_QAT_FW_COMN_CNVNR_FLAG_MASK) + +#define ICP_QAT_FW_COMN_HDR_CNVNR_FLAG_SET(hdr_t, val) \ + QAT_FIELD_SET((hdr_t.hdr_flags), (val), \ + ICP_QAT_FW_COMN_CNVNR_FLAG_BITPOS, \ + ICP_QAT_FW_COMN_CNVNR_FLAG_MASK) + +#define ICP_QAT_FW_COMN_HDR_CNV_FLAG_GET(hdr_flags) \ + QAT_FIELD_GET(hdr_flags, \ + ICP_QAT_FW_COMN_CNV_FLAG_BITPOS, \ + ICP_QAT_FW_COMN_CNV_FLAG_MASK) + +#define ICP_QAT_FW_COMN_HDR_CNV_FLAG_SET(hdr_t, val) \ + QAT_FIELD_SET((hdr_t.hdr_flags), (val), \ + ICP_QAT_FW_COMN_CNV_FLAG_BITPOS, \ + ICP_QAT_FW_COMN_CNV_FLAG_MASK) + #define ICP_QAT_FW_COMN_HDR_VALID_FLAG_SET(hdr_t, val) \ ICP_QAT_FW_COMN_VALID_FLAG_SET(hdr_t, val) diff --git a/drivers/crypto/qat/qat_common/icp_qat_fw_comp.h b/drivers/crypto/qat/qat_common/icp_qat_fw_comp.h new file mode 100644 index 0000000000000..a03d43fef2b37 --- /dev/null +++ b/drivers/crypto/qat/qat_common/icp_qat_fw_comp.h @@ -0,0 +1,404 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef _ICP_QAT_FW_COMP_H_ +#define _ICP_QAT_FW_COMP_H_ +#include "icp_qat_fw.h" + +enum icp_qat_fw_comp_cmd_id { + ICP_QAT_FW_COMP_CMD_STATIC = 0, + ICP_QAT_FW_COMP_CMD_DYNAMIC = 1, + ICP_QAT_FW_COMP_CMD_DECOMPRESS = 2, + ICP_QAT_FW_COMP_CMD_DELIMITER +}; + +enum icp_qat_fw_comp_20_cmd_id { + ICP_QAT_FW_COMP_20_CMD_LZ4_COMPRESS = 3, + ICP_QAT_FW_COMP_20_CMD_LZ4_DECOMPRESS = 4, + ICP_QAT_FW_COMP_20_CMD_LZ4S_COMPRESS = 5, + ICP_QAT_FW_COMP_20_CMD_LZ4S_DECOMPRESS = 6, + ICP_QAT_FW_COMP_20_CMD_XP10_COMPRESS = 7, + ICP_QAT_FW_COMP_20_CMD_XP10_DECOMPRESS = 8, + ICP_QAT_FW_COMP_20_CMD_RESERVED_9 = 9, + ICP_QAT_FW_COMP_23_CMD_ZSTD_COMPRESS = 10, + ICP_QAT_FW_COMP_23_CMD_ZSTD_DECOMPRESS = 11, + ICP_QAT_FW_COMP_20_CMD_DELIMITER +}; + +#define ICP_QAT_FW_COMP_STATELESS_SESSION 0 +#define ICP_QAT_FW_COMP_STATEFUL_SESSION 1 +#define ICP_QAT_FW_COMP_NOT_AUTO_SELECT_BEST 0 +#define ICP_QAT_FW_COMP_AUTO_SELECT_BEST 1 +#define ICP_QAT_FW_COMP_NOT_ENH_AUTO_SELECT_BEST 0 +#define ICP_QAT_FW_COMP_ENH_AUTO_SELECT_BEST 1 +#define ICP_QAT_FW_COMP_NOT_DISABLE_TYPE0_ENH_AUTO_SELECT_BEST 0 +#define ICP_QAT_FW_COMP_DISABLE_TYPE0_ENH_AUTO_SELECT_BEST 1 +#define ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_USED_AS_INTMD_BUF 1 +#define ICP_QAT_FW_COMP_ENABLE_SECURE_RAM_USED_AS_INTMD_BUF 0 +#define ICP_QAT_FW_COMP_SESSION_TYPE_BITPOS 2 +#define ICP_QAT_FW_COMP_SESSION_TYPE_MASK 0x1 +#define ICP_QAT_FW_COMP_AUTO_SELECT_BEST_BITPOS 3 +#define ICP_QAT_FW_COMP_AUTO_SELECT_BEST_MASK 0x1 +#define ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_BITPOS 4 +#define ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_MASK 0x1 +#define ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_BITPOS 5 +#define ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_MASK 0x1 +#define ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_BITPOS 7 +#define ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_MASK 0x1 + +#define ICP_QAT_FW_COMP_FLAGS_BUILD(sesstype, autoselect, enhanced_asb, \ + ret_uncomp, secure_ram) \ + ((((sesstype) & ICP_QAT_FW_COMP_SESSION_TYPE_MASK) << \ + ICP_QAT_FW_COMP_SESSION_TYPE_BITPOS) | \ + (((autoselect) & ICP_QAT_FW_COMP_AUTO_SELECT_BEST_MASK) << \ + ICP_QAT_FW_COMP_AUTO_SELECT_BEST_BITPOS) | \ + (((enhanced_asb) & ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_MASK) << \ + ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_BITPOS) | \ + (((ret_uncomp) & ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_MASK) << \ + ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_BITPOS) | \ + (((secure_ram) & ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_MASK) << \ + ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_BITPOS)) + +#define ICP_QAT_FW_COMP_SESSION_TYPE_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_SESSION_TYPE_BITPOS, \ + ICP_QAT_FW_COMP_SESSION_TYPE_MASK) + +#define ICP_QAT_FW_COMP_SESSION_TYPE_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_SESSION_TYPE_BITPOS, \ + ICP_QAT_FW_COMP_SESSION_TYPE_MASK) + +#define ICP_QAT_FW_COMP_AUTO_SELECT_BEST_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_AUTO_SELECT_BEST_BITPOS, \ + ICP_QAT_FW_COMP_AUTO_SELECT_BEST_MASK) + +#define ICP_QAT_FW_COMP_EN_ASB_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_BITPOS, \ + ICP_QAT_FW_COMP_ENHANCED_AUTO_SELECT_BEST_MASK) + +#define ICP_QAT_FW_COMP_RET_UNCOMP_GET(flags) \ + QAT_FIELD_GET(flags, \ + ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_BITPOS, \ + ICP_QAT_FW_COMP_RET_DISABLE_TYPE0_HEADER_DATA_MASK) + +#define ICP_QAT_FW_COMP_SECURE_RAM_USE_GET(flags) \ + QAT_FIELD_GET(flags, \ + ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_BITPOS, \ + ICP_QAT_FW_COMP_DISABLE_SECURE_RAM_AS_INTMD_BUF_MASK) + +struct icp_qat_fw_comp_req_hdr_cd_pars { + union { + struct { + __u64 content_desc_addr; + __u16 content_desc_resrvd1; + __u8 content_desc_params_sz; + __u8 content_desc_hdr_resrvd2; + __u32 content_desc_resrvd3; + } s; + struct { + __u32 comp_slice_cfg_word[ICP_QAT_FW_NUM_LONGWORDS_2]; + __u32 content_desc_resrvd4; + } sl; + } u; +}; + +struct icp_qat_fw_comp_req_params { + __u32 comp_len; + __u32 out_buffer_sz; + union { + struct { + __u32 initial_crc32; + __u32 initial_adler; + } legacy; + __u64 crc_data_addr; + } crc; + __u32 req_par_flags; + __u32 rsrvd; +}; + +#define ICP_QAT_FW_COMP_REQ_PARAM_FLAGS_BUILD(sop, eop, bfinal, cnv, cnvnr, \ + cnvdfx, crc, xxhash_acc, \ + cnv_error_type, append_crc, \ + drop_data) \ + ((((sop) & ICP_QAT_FW_COMP_SOP_MASK) << \ + ICP_QAT_FW_COMP_SOP_BITPOS) | \ + (((eop) & ICP_QAT_FW_COMP_EOP_MASK) << \ + ICP_QAT_FW_COMP_EOP_BITPOS) | \ + (((bfinal) & ICP_QAT_FW_COMP_BFINAL_MASK) \ + << ICP_QAT_FW_COMP_BFINAL_BITPOS) | \ + (((cnv) & ICP_QAT_FW_COMP_CNV_MASK) << \ + ICP_QAT_FW_COMP_CNV_BITPOS) | \ + (((cnvnr) & ICP_QAT_FW_COMP_CNVNR_MASK) \ + << ICP_QAT_FW_COMP_CNVNR_BITPOS) | \ + (((cnvdfx) & ICP_QAT_FW_COMP_CNV_DFX_MASK) \ + << ICP_QAT_FW_COMP_CNV_DFX_BITPOS) | \ + (((crc) & ICP_QAT_FW_COMP_CRC_MODE_MASK) \ + << ICP_QAT_FW_COMP_CRC_MODE_BITPOS) | \ + (((xxhash_acc) & ICP_QAT_FW_COMP_XXHASH_ACC_MODE_MASK) \ + << ICP_QAT_FW_COMP_XXHASH_ACC_MODE_BITPOS) | \ + (((cnv_error_type) & ICP_QAT_FW_COMP_CNV_ERROR_MASK) \ + << ICP_QAT_FW_COMP_CNV_ERROR_BITPOS) | \ + (((append_crc) & ICP_QAT_FW_COMP_APPEND_CRC_MASK) \ + << ICP_QAT_FW_COMP_APPEND_CRC_BITPOS) | \ + (((drop_data) & ICP_QAT_FW_COMP_DROP_DATA_MASK) \ + << ICP_QAT_FW_COMP_DROP_DATA_BITPOS)) + +#define ICP_QAT_FW_COMP_NOT_SOP 0 +#define ICP_QAT_FW_COMP_SOP 1 +#define ICP_QAT_FW_COMP_NOT_EOP 0 +#define ICP_QAT_FW_COMP_EOP 1 +#define ICP_QAT_FW_COMP_NOT_BFINAL 0 +#define ICP_QAT_FW_COMP_BFINAL 1 +#define ICP_QAT_FW_COMP_NO_CNV 0 +#define ICP_QAT_FW_COMP_CNV 1 +#define ICP_QAT_FW_COMP_NO_CNV_RECOVERY 0 +#define ICP_QAT_FW_COMP_CNV_RECOVERY 1 +#define ICP_QAT_FW_COMP_NO_CNV_DFX 0 +#define ICP_QAT_FW_COMP_CNV_DFX 1 +#define ICP_QAT_FW_COMP_CRC_MODE_LEGACY 0 +#define ICP_QAT_FW_COMP_CRC_MODE_E2E 1 +#define ICP_QAT_FW_COMP_NO_XXHASH_ACC 0 +#define ICP_QAT_FW_COMP_XXHASH_ACC 1 +#define ICP_QAT_FW_COMP_APPEND_CRC 1 +#define ICP_QAT_FW_COMP_NO_APPEND_CRC 0 +#define ICP_QAT_FW_COMP_DROP_DATA 1 +#define ICP_QAT_FW_COMP_NO_DROP_DATA 0 +#define ICP_QAT_FW_COMP_SOP_BITPOS 0 +#define ICP_QAT_FW_COMP_SOP_MASK 0x1 +#define ICP_QAT_FW_COMP_EOP_BITPOS 1 +#define ICP_QAT_FW_COMP_EOP_MASK 0x1 +#define ICP_QAT_FW_COMP_BFINAL_BITPOS 6 +#define ICP_QAT_FW_COMP_BFINAL_MASK 0x1 +#define ICP_QAT_FW_COMP_CNV_BITPOS 16 +#define ICP_QAT_FW_COMP_CNV_MASK 0x1 +#define ICP_QAT_FW_COMP_CNVNR_BITPOS 17 +#define ICP_QAT_FW_COMP_CNVNR_MASK 0x1 +#define ICP_QAT_FW_COMP_CNV_DFX_BITPOS 18 +#define ICP_QAT_FW_COMP_CNV_DFX_MASK 0x1 +#define ICP_QAT_FW_COMP_CRC_MODE_BITPOS 19 +#define ICP_QAT_FW_COMP_CRC_MODE_MASK 0x1 +#define ICP_QAT_FW_COMP_XXHASH_ACC_MODE_BITPOS 20 +#define ICP_QAT_FW_COMP_XXHASH_ACC_MODE_MASK 0x1 +#define ICP_QAT_FW_COMP_CNV_ERROR_BITPOS 21 +#define ICP_QAT_FW_COMP_CNV_ERROR_MASK 0b111 +#define ICP_QAT_FW_COMP_CNV_ERROR_NONE 0b000 +#define ICP_QAT_FW_COMP_CNV_ERROR_CHECKSUM 0b001 +#define ICP_QAT_FW_COMP_CNV_ERROR_DCPR_OBC_DIFF 0b010 +#define ICP_QAT_FW_COMP_CNV_ERROR_DCPR 0b011 +#define ICP_QAT_FW_COMP_CNV_ERROR_XLT 0b100 +#define ICP_QAT_FW_COMP_CNV_ERROR_DCPR_IBC_DIFF 0b101 +#define ICP_QAT_FW_COMP_APPEND_CRC_BITPOS 24 +#define ICP_QAT_FW_COMP_APPEND_CRC_MASK 0x1 +#define ICP_QAT_FW_COMP_DROP_DATA_BITPOS 25 +#define ICP_QAT_FW_COMP_DROP_DATA_MASK 0x1 + +#define ICP_QAT_FW_COMP_SOP_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_SOP_BITPOS, \ + ICP_QAT_FW_COMP_SOP_MASK) + +#define ICP_QAT_FW_COMP_SOP_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_SOP_BITPOS, \ + ICP_QAT_FW_COMP_SOP_MASK) + +#define ICP_QAT_FW_COMP_EOP_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_EOP_BITPOS, \ + ICP_QAT_FW_COMP_EOP_MASK) + +#define ICP_QAT_FW_COMP_EOP_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_EOP_BITPOS, \ + ICP_QAT_FW_COMP_EOP_MASK) + +#define ICP_QAT_FW_COMP_BFINAL_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_BFINAL_BITPOS, \ + ICP_QAT_FW_COMP_BFINAL_MASK) + +#define ICP_QAT_FW_COMP_BFINAL_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_BFINAL_BITPOS, \ + ICP_QAT_FW_COMP_BFINAL_MASK) + +#define ICP_QAT_FW_COMP_CNV_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_CNV_BITPOS, \ + ICP_QAT_FW_COMP_CNV_MASK) + +#define ICP_QAT_FW_COMP_CNVNR_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_CNVNR_BITPOS, \ + ICP_QAT_FW_COMP_CNVNR_MASK) + +#define ICP_QAT_FW_COMP_CNV_DFX_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_CNV_DFX_BITPOS, \ + ICP_QAT_FW_COMP_CNV_DFX_MASK) + +#define ICP_QAT_FW_COMP_CNV_DFX_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_CNV_DFX_BITPOS, \ + ICP_QAT_FW_COMP_CNV_DFX_MASK) + +#define ICP_QAT_FW_COMP_CRC_MODE_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_CRC_MODE_BITPOS, \ + ICP_QAT_FW_COMP_CRC_MODE_MASK) + +#define ICP_QAT_FW_COMP_XXHASH_ACC_MODE_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_XXHASH_ACC_MODE_BITPOS, \ + ICP_QAT_FW_COMP_XXHASH_ACC_MODE_MASK) + +#define ICP_QAT_FW_COMP_XXHASH_ACC_MODE_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_XXHASH_ACC_MODE_BITPOS, \ + ICP_QAT_FW_COMP_XXHASH_ACC_MODE_MASK) + +#define ICP_QAT_FW_COMP_CNV_ERROR_TYPE_GET(flags) \ + QAT_FIELD_GET(flags, ICP_QAT_FW_COMP_CNV_ERROR_BITPOS, \ + ICP_QAT_FW_COMP_CNV_ERROR_MASK) + +#define ICP_QAT_FW_COMP_CNV_ERROR_TYPE_SET(flags, val) \ + QAT_FIELD_SET(flags, val, ICP_QAT_FW_COMP_CNV_ERROR_BITPOS, \ + ICP_QAT_FW_COMP_CNV_ERROR_MASK) + +struct icp_qat_fw_xlt_req_params { + __u64 inter_buff_ptr; +}; + +struct icp_qat_fw_comp_cd_hdr { + __u16 ram_bank_flags; + __u8 comp_cfg_offset; + __u8 next_curr_id; + __u32 resrvd; + __u64 comp_state_addr; + __u64 ram_banks_addr; +}; + +#define COMP_CPR_INITIAL_CRC 0 +#define COMP_CPR_INITIAL_ADLER 1 + +struct icp_qat_fw_xlt_cd_hdr { + __u16 resrvd1; + __u8 resrvd2; + __u8 next_curr_id; + __u32 resrvd3; +}; + +struct icp_qat_fw_comp_req { + struct icp_qat_fw_comn_req_hdr comn_hdr; + struct icp_qat_fw_comp_req_hdr_cd_pars cd_pars; + struct icp_qat_fw_comn_req_mid comn_mid; + struct icp_qat_fw_comp_req_params comp_pars; + union { + struct icp_qat_fw_xlt_req_params xlt_pars; + __u32 resrvd1[ICP_QAT_FW_NUM_LONGWORDS_2]; + } u1; + __u32 resrvd2[ICP_QAT_FW_NUM_LONGWORDS_2]; + struct icp_qat_fw_comp_cd_hdr comp_cd_ctrl; + union { + struct icp_qat_fw_xlt_cd_hdr xlt_cd_ctrl; + __u32 resrvd3[ICP_QAT_FW_NUM_LONGWORDS_2]; + } u2; +}; + +struct icp_qat_fw_resp_comp_pars { + __u32 input_byte_counter; + __u32 output_byte_counter; + union { + struct { + __u32 curr_crc32; + __u32 curr_adler_32; + } legacy; + __u32 resrvd[ICP_QAT_FW_NUM_LONGWORDS_2]; + } crc; +}; + +struct icp_qat_fw_comp_state { + __u32 rd8_counter; + __u32 status_flags; + __u32 in_counter; + __u32 out_counter; + __u64 intermediate_state; + __u32 lobc; + __u32 replaybc; + __u64 pcrc64_poly; + __u32 crc32; + __u32 adler_xxhash32; + __u64 pcrc64_xorout; + __u32 out_buf_size; + __u32 in_buf_size; + __u64 in_pcrc64; + __u64 out_pcrc64; + __u32 lobs; + __u32 libc; + __u64 reserved; + __u32 xxhash_state[4]; + __u32 cleartext[4]; +}; + +struct icp_qat_fw_comp_resp { + struct icp_qat_fw_comn_resp_hdr comn_resp; + __u64 opaque_data; + struct icp_qat_fw_resp_comp_pars comp_resp_pars; +}; + +#define QAT_FW_COMP_BANK_FLAG_MASK 0x1 +#define QAT_FW_COMP_BANK_I_BITPOS 8 +#define QAT_FW_COMP_BANK_H_BITPOS 7 +#define QAT_FW_COMP_BANK_G_BITPOS 6 +#define QAT_FW_COMP_BANK_F_BITPOS 5 +#define QAT_FW_COMP_BANK_E_BITPOS 4 +#define QAT_FW_COMP_BANK_D_BITPOS 3 +#define QAT_FW_COMP_BANK_C_BITPOS 2 +#define QAT_FW_COMP_BANK_B_BITPOS 1 +#define QAT_FW_COMP_BANK_A_BITPOS 0 + +enum icp_qat_fw_comp_bank_enabled { + ICP_QAT_FW_COMP_BANK_DISABLED = 0, + ICP_QAT_FW_COMP_BANK_ENABLED = 1, + ICP_QAT_FW_COMP_BANK_DELIMITER = 2 +}; + +#define ICP_QAT_FW_COMP_RAM_FLAGS_BUILD(bank_i_enable, bank_h_enable, \ + bank_g_enable, bank_f_enable, \ + bank_e_enable, bank_d_enable, \ + bank_c_enable, bank_b_enable, \ + bank_a_enable) \ + ((((bank_i_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_I_BITPOS) | \ + (((bank_h_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_H_BITPOS) | \ + (((bank_g_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_G_BITPOS) | \ + (((bank_f_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_F_BITPOS) | \ + (((bank_e_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_E_BITPOS) | \ + (((bank_d_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_D_BITPOS) | \ + (((bank_c_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_C_BITPOS) | \ + (((bank_b_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_B_BITPOS) | \ + (((bank_a_enable) & QAT_FW_COMP_BANK_FLAG_MASK) << \ + QAT_FW_COMP_BANK_A_BITPOS)) + +struct icp_qat_fw_comp_crc_data_struct { + __u32 crc32; + union { + __u32 adler; + __u32 xxhash; + } adler_xxhash_u; + __u32 cpr_in_crc_lo; + __u32 cpr_in_crc_hi; + __u32 cpr_out_crc_lo; + __u32 cpr_out_crc_hi; + __u32 xlt_in_crc_lo; + __u32 xlt_in_crc_hi; + __u32 xlt_out_crc_lo; + __u32 xlt_out_crc_hi; + __u32 prog_crc_poly_lo; + __u32 prog_crc_poly_hi; + __u32 xor_out_lo; + __u32 xor_out_hi; + __u32 append_crc_lo; + __u32 append_crc_hi; +}; + +struct xxhash_acc_state_buff { + __u32 in_counter; + __u32 out_counter; + __u32 xxhash_state[4]; + __u32 clear_txt[4]; +}; + +#endif diff --git a/drivers/crypto/qat/qat_common/icp_qat_hw.h b/drivers/crypto/qat/qat_common/icp_qat_hw.h index 433304cad2edf..4042739bb6fa9 100644 --- a/drivers/crypto/qat/qat_common/icp_qat_hw.h +++ b/drivers/crypto/qat/qat_common/icp_qat_hw.h @@ -307,4 +307,70 @@ struct icp_qat_hw_cipher_algo_blk { struct icp_qat_hw_ucs_cipher_aes256_f8 ucs_aes; }; } __aligned(64); + +enum icp_qat_hw_compression_direction { + ICP_QAT_HW_COMPRESSION_DIR_COMPRESS = 0, + ICP_QAT_HW_COMPRESSION_DIR_DECOMPRESS = 1, + ICP_QAT_HW_COMPRESSION_DIR_DELIMITER = 2 +}; + +enum icp_qat_hw_compression_delayed_match { + ICP_QAT_HW_COMPRESSION_DELAYED_MATCH_DISABLED = 0, + ICP_QAT_HW_COMPRESSION_DELAYED_MATCH_ENABLED = 1, + ICP_QAT_HW_COMPRESSION_DELAYED_MATCH_DELIMITER = 2 +}; + +enum icp_qat_hw_compression_algo { + ICP_QAT_HW_COMPRESSION_ALGO_DEFLATE = 0, + ICP_QAT_HW_COMPRESSION_ALGO_LZS = 1, + ICP_QAT_HW_COMPRESSION_ALGO_DELIMITER = 2 +}; + +enum icp_qat_hw_compression_depth { + ICP_QAT_HW_COMPRESSION_DEPTH_1 = 0, + ICP_QAT_HW_COMPRESSION_DEPTH_4 = 1, + ICP_QAT_HW_COMPRESSION_DEPTH_8 = 2, + ICP_QAT_HW_COMPRESSION_DEPTH_16 = 3, + ICP_QAT_HW_COMPRESSION_DEPTH_128 = 4, + ICP_QAT_HW_COMPRESSION_DEPTH_DELIMITER = 5 +}; + +enum icp_qat_hw_compression_file_type { + ICP_QAT_HW_COMPRESSION_FILE_TYPE_0 = 0, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_1 = 1, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_2 = 2, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_3 = 3, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_4 = 4, + ICP_QAT_HW_COMPRESSION_FILE_TYPE_DELIMITER = 5 +}; + +struct icp_qat_hw_compression_config { + __u32 lower_val; + __u32 upper_val; +}; + +#define QAT_COMPRESSION_DIR_BITPOS 4 +#define QAT_COMPRESSION_DIR_MASK 0x7 +#define QAT_COMPRESSION_DELAYED_MATCH_BITPOS 16 +#define QAT_COMPRESSION_DELAYED_MATCH_MASK 0x1 +#define QAT_COMPRESSION_ALGO_BITPOS 31 +#define QAT_COMPRESSION_ALGO_MASK 0x1 +#define QAT_COMPRESSION_DEPTH_BITPOS 28 +#define QAT_COMPRESSION_DEPTH_MASK 0x7 +#define QAT_COMPRESSION_FILE_TYPE_BITPOS 24 +#define QAT_COMPRESSION_FILE_TYPE_MASK 0xF + +#define ICP_QAT_HW_COMPRESSION_CONFIG_BUILD(dir, delayed, \ + algo, depth, filetype) \ + ((((dir) & QAT_COMPRESSION_DIR_MASK) << \ + QAT_COMPRESSION_DIR_BITPOS) | \ + (((delayed) & QAT_COMPRESSION_DELAYED_MATCH_MASK) << \ + QAT_COMPRESSION_DELAYED_MATCH_BITPOS) | \ + (((algo) & QAT_COMPRESSION_ALGO_MASK) << \ + QAT_COMPRESSION_ALGO_BITPOS) | \ + (((depth) & QAT_COMPRESSION_DEPTH_MASK) << \ + QAT_COMPRESSION_DEPTH_BITPOS) | \ + (((filetype) & QAT_COMPRESSION_FILE_TYPE_MASK) << \ + QAT_COMPRESSION_FILE_TYPE_BITPOS)) + #endif diff --git a/drivers/crypto/qat/qat_common/qat_comp_algs.c b/drivers/crypto/qat/qat_common/qat_comp_algs.c new file mode 100644 index 0000000000000..63fd4ac33dbf3 --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_comp_algs.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation */ +#include +#include +#include +#include +#include +#include "adf_accel_devices.h" +#include "adf_common_drv.h" +#include "qat_bl.h" +#include "qat_comp_req.h" +#include "qat_compression.h" +#include "qat_algs_send.h" + +static DEFINE_MUTEX(algs_lock); +static unsigned int active_devs; + +enum direction { + DECOMPRESSION = 0, + COMPRESSION = 1, +}; + +struct qat_compression_ctx { + u8 comp_ctx[QAT_COMP_CTX_SIZE]; + struct qat_compression_instance *inst; +}; + +struct qat_compression_req { + u8 req[QAT_COMP_REQ_SIZE]; + struct qat_compression_ctx *qat_compression_ctx; + struct acomp_req *acompress_req; + struct qat_request_buffs buf; + enum direction dir; + int actual_dlen; + struct qat_alg_req alg_req; +}; + +static int qat_alg_send_dc_message(struct qat_compression_req *qat_req, + struct qat_compression_instance *inst, + struct crypto_async_request *base) +{ + struct qat_alg_req *alg_req = &qat_req->alg_req; + + alg_req->fw_req = (u32 *)&qat_req->req; + alg_req->tx_ring = inst->dc_tx; + alg_req->base = base; + alg_req->backlog = &inst->backlog; + + return qat_alg_send_message(alg_req); +} + +static void qat_comp_generic_callback(struct qat_compression_req *qat_req, + void *resp) +{ + struct acomp_req *areq = qat_req->acompress_req; + struct qat_compression_ctx *ctx = qat_req->qat_compression_ctx; + struct adf_accel_dev *accel_dev = ctx->inst->accel_dev; + struct crypto_acomp *tfm = crypto_acomp_reqtfm(areq); + struct qat_compression_instance *inst = ctx->inst; + int consumed, produced; + s8 cmp_err, xlt_err; + int res = -EBADMSG; + int status; + u8 cnv; + + status = qat_comp_get_cmp_status(resp); + status |= qat_comp_get_xlt_status(resp); + cmp_err = qat_comp_get_cmp_err(resp); + xlt_err = qat_comp_get_xlt_err(resp); + + consumed = qat_comp_get_consumed_ctr(resp); + produced = qat_comp_get_produced_ctr(resp); + + dev_dbg(&GET_DEV(accel_dev), + "[%s][%s][%s] slen = %8d dlen = %8d consumed = %8d produced = %8d cmp_err = %3d xlt_err = %3d", + crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)), + qat_req->dir == COMPRESSION ? "comp " : "decomp", + status ? "ERR" : "OK ", + areq->slen, areq->dlen, consumed, produced, cmp_err, xlt_err); + + areq->dlen = 0; + + if (unlikely(status != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) + goto end; + + if (qat_req->dir == COMPRESSION) { + cnv = qat_comp_get_cmp_cnv_flag(resp); + if (unlikely(!cnv)) { + dev_err(&GET_DEV(accel_dev), + "Verified compression not supported\n"); + goto end; + } + + if (unlikely(produced > qat_req->actual_dlen)) { + memset(inst->dc_data->ovf_buff, 0, + inst->dc_data->ovf_buff_sz); + dev_dbg(&GET_DEV(accel_dev), + "Actual buffer overflow: produced=%d, dlen=%d\n", + produced, qat_req->actual_dlen); + goto end; + } + } + + res = 0; + areq->dlen = produced; + +end: + qat_bl_free_bufl(accel_dev, &qat_req->buf); + areq->base.complete(&areq->base, res); +} + +void qat_comp_alg_callback(void *resp) +{ + struct qat_compression_req *qat_req = + (void *)(__force long)qat_comp_get_opaque(resp); + struct qat_instance_backlog *backlog = qat_req->alg_req.backlog; + + qat_comp_generic_callback(qat_req, resp); + + qat_alg_send_backlog(backlog); +} + +static int qat_comp_alg_init_tfm(struct crypto_acomp *acomp_tfm) +{ + struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); + struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); + struct qat_compression_instance *inst; + int node; + + if (tfm->node == NUMA_NO_NODE) + node = numa_node_id(); + else + node = tfm->node; + + memset(ctx, 0, sizeof(*ctx)); + inst = qat_compression_get_instance_node(node); + if (!inst) + return -EINVAL; + ctx->inst = inst; + + ctx->inst->build_deflate_ctx(ctx->comp_ctx); + + return 0; +} + +static void qat_comp_alg_exit_tfm(struct crypto_acomp *acomp_tfm) +{ + struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); + struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); + + qat_compression_put_instance(ctx->inst); + memset(ctx, 0, sizeof(*ctx)); +} + +static int qat_comp_alg_compress_decompress(struct acomp_req *areq, + enum direction dir) +{ + struct qat_compression_req *qat_req = acomp_request_ctx(areq); + struct crypto_acomp *acomp_tfm = crypto_acomp_reqtfm(areq); + struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm); + struct qat_compression_ctx *ctx = crypto_tfm_ctx(tfm); + struct qat_compression_instance *inst = ctx->inst; + struct qat_sgl_to_bufl_params *p_params = NULL; + gfp_t f = qat_algs_alloc_flags(&areq->base); + struct qat_sgl_to_bufl_params params; + unsigned int slen = areq->slen; + unsigned int dlen = areq->dlen; + dma_addr_t sfbuf, dfbuf; + u8 *req = qat_req->req; + size_t ovf_buff_sz; + int ret; + + if (!areq->src || !slen) + return -EINVAL; + + if (areq->dst && !dlen) + return -EINVAL; + + /* Handle acomp requests that require the allocation of a destination + * buffer. The size of the destination buffer is double the source + * buffer (rounded up to the size of a page) to fit the decompressed + * output or an expansion on the data for compression. + */ + if (!areq->dst) { + dlen = round_up(2 * slen, PAGE_SIZE); + areq->dst = sgl_alloc(dlen, f, NULL); + if (!areq->dst) + return -ENOMEM; + } + + if (dir == COMPRESSION) { + params.extra_dst_buff = inst->dc_data->ovf_buff_p; + ovf_buff_sz = inst->dc_data->ovf_buff_sz; + params.sz_extra_dst_buff = ovf_buff_sz; + p_params = ¶ms; + } + + ret = qat_bl_sgl_to_bufl(ctx->inst->accel_dev, areq->src, areq->dst, + &qat_req->buf, p_params, f); + if (unlikely(ret)) + return ret; + + sfbuf = qat_req->buf.blp; + dfbuf = qat_req->buf.bloutp; + qat_req->qat_compression_ctx = ctx; + qat_req->acompress_req = areq; + qat_req->dir = dir; + + if (dir == COMPRESSION) { + qat_req->actual_dlen = dlen; + dlen += ovf_buff_sz; + qat_comp_create_compression_req(ctx->comp_ctx, req, + (u64)(__force long)sfbuf, slen, + (u64)(__force long)dfbuf, dlen, + (u64)(__force long)qat_req); + } else { + qat_comp_create_decompression_req(ctx->comp_ctx, req, + (u64)(__force long)sfbuf, slen, + (u64)(__force long)dfbuf, dlen, + (u64)(__force long)qat_req); + } + + ret = qat_alg_send_dc_message(qat_req, inst, &areq->base); + if (ret == -ENOSPC) + qat_bl_free_bufl(inst->accel_dev, &qat_req->buf); + + return ret; +} + +static int qat_comp_alg_compress(struct acomp_req *req) +{ + return qat_comp_alg_compress_decompress(req, COMPRESSION); +} + +static int qat_comp_alg_decompress(struct acomp_req *req) +{ + return qat_comp_alg_compress_decompress(req, DECOMPRESSION); +} + +static struct acomp_alg qat_acomp[] = { { + .base = { + .cra_name = "deflate", + .cra_driver_name = "qat_deflate", + .cra_priority = 4001, + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, + .cra_ctxsize = sizeof(struct qat_compression_ctx), + .cra_module = THIS_MODULE, + }, + .init = qat_comp_alg_init_tfm, + .exit = qat_comp_alg_exit_tfm, + .compress = qat_comp_alg_compress, + .decompress = qat_comp_alg_decompress, + .dst_free = sgl_free, + .reqsize = sizeof(struct qat_compression_req), +} }; + +int qat_comp_algs_register(void) +{ + int ret = 0; + + mutex_lock(&algs_lock); + if (++active_devs == 1) + ret = crypto_register_acomps(qat_acomp, ARRAY_SIZE(qat_acomp)); + mutex_unlock(&algs_lock); + return ret; +} + +void qat_comp_algs_unregister(void) +{ + mutex_lock(&algs_lock); + if (--active_devs == 0) + crypto_unregister_acomps(qat_acomp, ARRAY_SIZE(qat_acomp)); + mutex_unlock(&algs_lock); +} diff --git a/drivers/crypto/qat/qat_common/qat_comp_req.h b/drivers/crypto/qat/qat_common/qat_comp_req.h new file mode 100644 index 0000000000000..18a1f33a6db98 --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_comp_req.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef _QAT_COMP_REQ_H_ +#define _QAT_COMP_REQ_H_ + +#include "icp_qat_fw_comp.h" + +#define QAT_COMP_REQ_SIZE (sizeof(struct icp_qat_fw_comp_req)) +#define QAT_COMP_CTX_SIZE (QAT_COMP_REQ_SIZE * 2) + +static inline void qat_comp_create_req(void *ctx, void *req, u64 src, u32 slen, + u64 dst, u32 dlen, u64 opaque) +{ + struct icp_qat_fw_comp_req *fw_tmpl = ctx; + struct icp_qat_fw_comp_req *fw_req = req; + struct icp_qat_fw_comp_req_params *req_pars = &fw_req->comp_pars; + + memcpy(fw_req, fw_tmpl, sizeof(*fw_req)); + fw_req->comn_mid.src_data_addr = src; + fw_req->comn_mid.src_length = slen; + fw_req->comn_mid.dest_data_addr = dst; + fw_req->comn_mid.dst_length = dlen; + fw_req->comn_mid.opaque_data = opaque; + req_pars->comp_len = slen; + req_pars->out_buffer_sz = dlen; +} + +static inline void qat_comp_create_compression_req(void *ctx, void *req, + u64 src, u32 slen, + u64 dst, u32 dlen, + u64 opaque) +{ + qat_comp_create_req(ctx, req, src, slen, dst, dlen, opaque); +} + +static inline void qat_comp_create_decompression_req(void *ctx, void *req, + u64 src, u32 slen, + u64 dst, u32 dlen, + u64 opaque) +{ + struct icp_qat_fw_comp_req *fw_tmpl = ctx; + + fw_tmpl++; + qat_comp_create_req(fw_tmpl, req, src, slen, dst, dlen, opaque); +} + +static inline u32 qat_comp_get_consumed_ctr(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->comp_resp_pars.input_byte_counter; +} + +static inline u32 qat_comp_get_produced_ctr(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->comp_resp_pars.output_byte_counter; +} + +static inline u32 qat_comp_get_produced_adler32(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->comp_resp_pars.crc.legacy.curr_adler_32; +} + +static inline u64 qat_comp_get_opaque(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->opaque_data; +} + +static inline s8 qat_comp_get_cmp_err(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->comn_resp.comn_error.cmp_err_code; +} + +static inline s8 qat_comp_get_xlt_err(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + + return qat_resp->comn_resp.comn_error.xlat_err_code; +} + +static inline s8 qat_comp_get_cmp_status(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + u8 stat_filed = qat_resp->comn_resp.comn_status; + + return ICP_QAT_FW_COMN_RESP_CMP_STAT_GET(stat_filed); +} + +static inline s8 qat_comp_get_xlt_status(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + u8 stat_filed = qat_resp->comn_resp.comn_status; + + return ICP_QAT_FW_COMN_RESP_XLAT_STAT_GET(stat_filed); +} + +static inline u8 qat_comp_get_cmp_cnv_flag(void *resp) +{ + struct icp_qat_fw_comp_resp *qat_resp = resp; + u8 flags = qat_resp->comn_resp.hdr_flags; + + return ICP_QAT_FW_COMN_HDR_CNV_FLAG_GET(flags); +} + +#endif diff --git a/drivers/crypto/qat/qat_common/qat_compression.c b/drivers/crypto/qat/qat_common/qat_compression.c new file mode 100644 index 0000000000000..9fd10f4242f84 --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_compression.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation */ +#include +#include +#include "adf_accel_devices.h" +#include "adf_common_drv.h" +#include "adf_transport.h" +#include "adf_transport_access_macros.h" +#include "adf_cfg.h" +#include "adf_cfg_strings.h" +#include "qat_compression.h" +#include "icp_qat_fw.h" + +#define SEC ADF_KERNEL_SEC + +static struct service_hndl qat_compression; + +void qat_compression_put_instance(struct qat_compression_instance *inst) +{ + atomic_dec(&inst->refctr); + adf_dev_put(inst->accel_dev); +} + +static int qat_compression_free_instances(struct adf_accel_dev *accel_dev) +{ + struct qat_compression_instance *inst; + struct list_head *list_ptr, *tmp; + int i; + + list_for_each_safe(list_ptr, tmp, &accel_dev->compression_list) { + inst = list_entry(list_ptr, + struct qat_compression_instance, list); + + for (i = 0; i < atomic_read(&inst->refctr); i++) + qat_compression_put_instance(inst); + + if (inst->dc_tx) + adf_remove_ring(inst->dc_tx); + + if (inst->dc_rx) + adf_remove_ring(inst->dc_rx); + + list_del(list_ptr); + kfree(inst); + } + return 0; +} + +struct qat_compression_instance *qat_compression_get_instance_node(int node) +{ + struct qat_compression_instance *inst = NULL; + struct adf_accel_dev *accel_dev = NULL; + unsigned long best = ~0; + struct list_head *itr; + + list_for_each(itr, adf_devmgr_get_head()) { + struct adf_accel_dev *tmp_dev; + unsigned long ctr; + int tmp_dev_node; + + tmp_dev = list_entry(itr, struct adf_accel_dev, list); + tmp_dev_node = dev_to_node(&GET_DEV(tmp_dev)); + + if ((node == tmp_dev_node || tmp_dev_node < 0) && + adf_dev_started(tmp_dev) && !list_empty(&tmp_dev->compression_list)) { + ctr = atomic_read(&tmp_dev->ref_count); + if (best > ctr) { + accel_dev = tmp_dev; + best = ctr; + } + } + } + + if (!accel_dev) { + pr_info("QAT: Could not find a device on node %d\n", node); + /* Get any started device */ + list_for_each(itr, adf_devmgr_get_head()) { + struct adf_accel_dev *tmp_dev; + + tmp_dev = list_entry(itr, struct adf_accel_dev, list); + if (adf_dev_started(tmp_dev) && + !list_empty(&tmp_dev->compression_list)) { + accel_dev = tmp_dev; + break; + } + } + } + + if (!accel_dev) + return NULL; + + best = ~0; + list_for_each(itr, &accel_dev->compression_list) { + struct qat_compression_instance *tmp_inst; + unsigned long ctr; + + tmp_inst = list_entry(itr, struct qat_compression_instance, list); + ctr = atomic_read(&tmp_inst->refctr); + if (best > ctr) { + inst = tmp_inst; + best = ctr; + } + } + if (inst) { + if (adf_dev_get(accel_dev)) { + dev_err(&GET_DEV(accel_dev), "Could not increment dev refctr\n"); + return NULL; + } + atomic_inc(&inst->refctr); + } + return inst; +} + +static int qat_compression_create_instances(struct adf_accel_dev *accel_dev) +{ + struct qat_compression_instance *inst; + char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; + char val[ADF_CFG_MAX_VAL_LEN_IN_BYTES]; + unsigned long num_inst, num_msg_dc; + unsigned long bank; + int msg_size; + int ret; + int i; + + INIT_LIST_HEAD(&accel_dev->compression_list); + strscpy(key, ADF_NUM_DC, sizeof(key)); + ret = adf_cfg_get_param_value(accel_dev, SEC, key, val); + if (ret) + return ret; + + ret = kstrtoul(val, 10, &num_inst); + if (ret) + return ret; + + for (i = 0; i < num_inst; i++) { + inst = kzalloc_node(sizeof(*inst), GFP_KERNEL, + dev_to_node(&GET_DEV(accel_dev))); + if (!inst) { + ret = -ENOMEM; + goto err; + } + + list_add_tail(&inst->list, &accel_dev->compression_list); + inst->id = i; + atomic_set(&inst->refctr, 0); + inst->accel_dev = accel_dev; + inst->build_deflate_ctx = GET_DC_OPS(accel_dev)->build_deflate_ctx; + + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_BANK_NUM, i); + ret = adf_cfg_get_param_value(accel_dev, SEC, key, val); + if (ret) + return ret; + + ret = kstrtoul(val, 10, &bank); + if (ret) + return ret; + + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_SIZE, i); + ret = adf_cfg_get_param_value(accel_dev, SEC, key, val); + if (ret) + return ret; + + ret = kstrtoul(val, 10, &num_msg_dc); + if (ret) + return ret; + + msg_size = ICP_QAT_FW_REQ_DEFAULT_SZ; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_TX, i); + ret = adf_create_ring(accel_dev, SEC, bank, num_msg_dc, + msg_size, key, NULL, 0, &inst->dc_tx); + if (ret) + return ret; + + msg_size = ICP_QAT_FW_RESP_DEFAULT_SZ; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_RX, i); + ret = adf_create_ring(accel_dev, SEC, bank, num_msg_dc, + msg_size, key, qat_comp_alg_callback, 0, + &inst->dc_rx); + if (ret) + return ret; + + inst->dc_data = accel_dev->dc_data; + INIT_LIST_HEAD(&inst->backlog.list); + spin_lock_init(&inst->backlog.lock); + } + return 0; +err: + qat_compression_free_instances(accel_dev); + return ret; +} + +static int qat_compression_alloc_dc_data(struct adf_accel_dev *accel_dev) +{ + struct device *dev = &GET_DEV(accel_dev); + dma_addr_t obuff_p = DMA_MAPPING_ERROR; + size_t ovf_buff_sz = QAT_COMP_MAX_SKID; + struct adf_dc_data *dc_data = NULL; + u8 *obuff = NULL; + + dc_data = devm_kzalloc(dev, sizeof(*dc_data), GFP_KERNEL); + if (!dc_data) + goto err; + + obuff = kzalloc_node(ovf_buff_sz, GFP_KERNEL, dev_to_node(dev)); + if (!obuff) + goto err; + + obuff_p = dma_map_single(dev, obuff, ovf_buff_sz, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(dev, obuff_p))) + goto err; + + dc_data->ovf_buff = obuff; + dc_data->ovf_buff_p = obuff_p; + dc_data->ovf_buff_sz = ovf_buff_sz; + + accel_dev->dc_data = dc_data; + + return 0; + +err: + accel_dev->dc_data = NULL; + kfree(obuff); + devm_kfree(dev, dc_data); + return -ENOMEM; +} + +static void qat_free_dc_data(struct adf_accel_dev *accel_dev) +{ + struct adf_dc_data *dc_data = accel_dev->dc_data; + struct device *dev = &GET_DEV(accel_dev); + + if (!dc_data) + return; + + dma_unmap_single(dev, dc_data->ovf_buff_p, dc_data->ovf_buff_sz, + DMA_FROM_DEVICE); + memset(dc_data->ovf_buff, 0, dc_data->ovf_buff_sz); + kfree(dc_data->ovf_buff); + devm_kfree(dev, dc_data); + accel_dev->dc_data = NULL; +} + +static int qat_compression_init(struct adf_accel_dev *accel_dev) +{ + int ret; + + ret = qat_compression_alloc_dc_data(accel_dev); + if (ret) + return ret; + + ret = qat_compression_create_instances(accel_dev); + if (ret) + qat_free_dc_data(accel_dev); + + return ret; +} + +static int qat_compression_shutdown(struct adf_accel_dev *accel_dev) +{ + qat_free_dc_data(accel_dev); + return qat_compression_free_instances(accel_dev); +} + +static int qat_compression_event_handler(struct adf_accel_dev *accel_dev, + enum adf_event event) +{ + int ret; + + switch (event) { + case ADF_EVENT_INIT: + ret = qat_compression_init(accel_dev); + break; + case ADF_EVENT_SHUTDOWN: + ret = qat_compression_shutdown(accel_dev); + break; + case ADF_EVENT_RESTARTING: + case ADF_EVENT_RESTARTED: + case ADF_EVENT_START: + case ADF_EVENT_STOP: + default: + ret = 0; + } + return ret; +} + +int qat_compression_register(void) +{ + memset(&qat_compression, 0, sizeof(qat_compression)); + qat_compression.event_hld = qat_compression_event_handler; + qat_compression.name = "qat_compression"; + return adf_service_register(&qat_compression); +} + +int qat_compression_unregister(void) +{ + return adf_service_unregister(&qat_compression); +} diff --git a/drivers/crypto/qat/qat_common/qat_compression.h b/drivers/crypto/qat/qat_common/qat_compression.h new file mode 100644 index 0000000000000..aebac2302dcf2 --- /dev/null +++ b/drivers/crypto/qat/qat_common/qat_compression.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef _QAT_COMPRESSION_H_ +#define _QAT_COMPRESSION_H_ + +#include +#include +#include "adf_accel_devices.h" +#include "qat_algs_send.h" + +#define QAT_COMP_MAX_SKID 4096 + +struct qat_compression_instance { + struct adf_etr_ring_data *dc_tx; + struct adf_etr_ring_data *dc_rx; + struct adf_accel_dev *accel_dev; + struct list_head list; + unsigned long state; + int id; + atomic_t refctr; + struct qat_instance_backlog backlog; + struct adf_dc_data *dc_data; + void (*build_deflate_ctx)(void *ctx); +}; + +static inline bool adf_hw_dev_has_compression(struct adf_accel_dev *accel_dev) +{ + struct adf_hw_device_data *hw_device = accel_dev->hw_device; + u32 mask = ~hw_device->accel_capabilities_mask; + + if (mask & ADF_ACCEL_CAPABILITIES_COMPRESSION) + return false; + + return true; +} + +#endif diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c b/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c index baacf817abf68..bc80bb475118d 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "adf_dh895xcc_hw_data.h" @@ -242,6 +243,7 @@ void adf_init_hw_data_dh895xcc(struct adf_hw_device_data *hw_data) hw_data->pfvf_ops.disable_all_vf2pf_interrupts = disable_all_vf2pf_interrupts; hw_data->pfvf_ops.disable_pending_vf2pf_interrupts = disable_pending_vf2pf_interrupts; adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_dh895xcc(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c index b933a00fb91b8..70e56cc16eceb 100644 --- a/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c +++ b/drivers/crypto/qat/qat_dh895xccvf/adf_dh895xccvf_hw_data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,7 @@ void adf_init_hw_data_dh895xcciov(struct adf_hw_device_data *hw_data) adf_devmgr_update_class_index(hw_data); adf_gen2_init_vf_pfvf_ops(&hw_data->pfvf_ops); adf_gen2_init_hw_csr_ops(&hw_data->csr_ops); + adf_gen2_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_dh895xcciov(struct adf_hw_device_data *hw_data) -- GitLab From 5b14b2b307e4045b38a4961718cbe9c17cef2bf4 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:21 +0000 Subject: [PATCH 1257/1423] crypto: qat - enable deflate for QAT GEN4 Enable deflate for QAT GEN4 devices. This adds (1) logic to create configuration entries at probe time for the compression instances for QAT GEN4 devices; (2) the implementation of QAT GEN4 specific compression operations, required since the creation of the compression request template is different between GEN2 and GEN4; and (3) updates to the firmware API related to compression for GEN4. The implementation configures the device to produce data compressed dynamically, optimized for throughput over compression ratio. Signed-off-by: Giovanni Cabiddu Reviewed-by: Wojciech Ziemba Reviewed-by: Adam Guerin Signed-off-by: Herbert Xu --- .../crypto/qat/qat_4xxx/adf_4xxx_hw_data.c | 4 +- .../crypto/qat/qat_4xxx/adf_4xxx_hw_data.h | 2 +- drivers/crypto/qat/qat_4xxx/adf_drv.c | 139 +++++++- drivers/crypto/qat/qat_common/Makefile | 1 + drivers/crypto/qat/qat_common/adf_gen4_dc.c | 83 +++++ drivers/crypto/qat/qat_common/adf_gen4_dc.h | 10 + .../qat/qat_common/icp_qat_hw_20_comp.h | 164 ++++++++++ .../qat/qat_common/icp_qat_hw_20_comp_defs.h | 300 ++++++++++++++++++ 8 files changed, 689 insertions(+), 14 deletions(-) create mode 100644 drivers/crypto/qat/qat_common/adf_gen4_dc.c create mode 100644 drivers/crypto/qat/qat_common/adf_gen4_dc.h create mode 100644 drivers/crypto/qat/qat_common/icp_qat_hw_20_comp.h create mode 100644 drivers/crypto/qat/qat_common/icp_qat_hw_20_comp_defs.h diff --git a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c index fda5f699ff575..834a705180c02 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c +++ b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -357,10 +358,11 @@ void adf_init_hw_data_4xxx(struct adf_hw_device_data *hw_data) hw_data->ring_pair_reset = adf_gen4_ring_pair_reset; hw_data->enable_pm = adf_gen4_enable_pm; hw_data->handle_pm_interrupt = adf_gen4_handle_pm_interrupt; - hw_data->dev_config = adf_crypto_dev_config; + hw_data->dev_config = adf_gen4_dev_config; adf_gen4_init_hw_csr_ops(&hw_data->csr_ops); adf_gen4_init_pf_pfvf_ops(&hw_data->pfvf_ops); + adf_gen4_init_dc_ops(&hw_data->dc_ops); } void adf_clean_hw_data_4xxx(struct adf_hw_device_data *hw_data) diff --git a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.h b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.h index 9d49248931f6a..e98428ba78e29 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.h +++ b/drivers/crypto/qat/qat_4xxx/adf_4xxx_hw_data.h @@ -70,6 +70,6 @@ enum icp_qat_4xxx_slice_mask { void adf_init_hw_data_4xxx(struct adf_hw_device_data *hw_data); void adf_clean_hw_data_4xxx(struct adf_hw_device_data *hw_data); -int adf_crypto_dev_config(struct adf_accel_dev *accel_dev); +int adf_gen4_dev_config(struct adf_accel_dev *accel_dev); #endif diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c b/drivers/crypto/qat/qat_4xxx/adf_drv.c index 8496c451b48e1..b3a4c7b238644 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c @@ -9,6 +9,7 @@ #include #include "adf_4xxx_hw_data.h" +#include "qat_compression.h" #include "qat_crypto.h" #include "adf_transport_access_macros.h" @@ -19,6 +20,16 @@ static const struct pci_device_id adf_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, adf_pci_tbl); +enum configs { + DEV_CFG_CY = 0, + DEV_CFG_DC, +}; + +static const char * const services_operations[] = { + ADF_CFG_CY, + ADF_CFG_DC, +}; + static void adf_cleanup_accel(struct adf_accel_dev *accel_dev) { if (accel_dev->hw_device) { @@ -53,7 +64,7 @@ static int adf_cfg_dev_init(struct adf_accel_dev *accel_dev) return 0; } -int adf_crypto_dev_config(struct adf_accel_dev *accel_dev) +static int adf_crypto_dev_config(struct adf_accel_dev *accel_dev) { char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; int banks = GET_MAX_BANKS(accel_dev); @@ -68,14 +79,6 @@ int adf_crypto_dev_config(struct adf_accel_dev *accel_dev) else instances = 0; - ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); - if (ret) - goto err; - - ret = adf_cfg_section_add(accel_dev, "Accelerator0"); - if (ret) - goto err; - for (i = 0; i < instances; i++) { val = i; bank = i * 2; @@ -161,10 +164,122 @@ int adf_crypto_dev_config(struct adf_accel_dev *accel_dev) if (ret) goto err; - set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); return 0; err: - dev_err(&GET_DEV(accel_dev), "Failed to start QAT accel dev\n"); + dev_err(&GET_DEV(accel_dev), "Failed to add configuration for crypto\n"); + return ret; +} + +static int adf_comp_dev_config(struct adf_accel_dev *accel_dev) +{ + char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES]; + int banks = GET_MAX_BANKS(accel_dev); + int cpus = num_online_cpus(); + unsigned long val; + int instances; + int ret; + int i; + + if (adf_hw_dev_has_compression(accel_dev)) + instances = min(cpus, banks); + else + instances = 0; + + for (i = 0; i < instances; i++) { + val = i; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_BANK_NUM, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 512; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_SIZE, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 0; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_TX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = 1; + snprintf(key, sizeof(key), ADF_DC "%d" ADF_RING_DC_RX, i); + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, + key, &val, ADF_DEC); + if (ret) + goto err; + + val = ADF_COALESCING_DEF_TIME; + snprintf(key, sizeof(key), ADF_ETRMGR_COALESCE_TIMER_FORMAT, i); + ret = adf_cfg_add_key_value_param(accel_dev, "Accelerator0", + key, &val, ADF_DEC); + if (ret) + goto err; + } + + val = i; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_DC, + &val, ADF_DEC); + if (ret) + goto err; + + val = 0; + ret = adf_cfg_add_key_value_param(accel_dev, ADF_KERNEL_SEC, ADF_NUM_CY, + &val, ADF_DEC); + if (ret) + goto err; + + return 0; +err: + dev_err(&GET_DEV(accel_dev), "Failed to add configuration for compression\n"); + return ret; +} + +int adf_gen4_dev_config(struct adf_accel_dev *accel_dev) +{ + char services[ADF_CFG_MAX_VAL_LEN_IN_BYTES] = {0}; + int ret; + + ret = adf_cfg_section_add(accel_dev, ADF_KERNEL_SEC); + if (ret) + goto err; + + ret = adf_cfg_section_add(accel_dev, "Accelerator0"); + if (ret) + goto err; + + ret = adf_cfg_get_param_value(accel_dev, ADF_GENERAL_SEC, + ADF_SERVICES_ENABLED, services); + if (ret) + goto err; + + ret = sysfs_match_string(services_operations, services); + if (ret < 0) + goto err; + + switch (ret) { + case DEV_CFG_CY: + ret = adf_crypto_dev_config(accel_dev); + break; + case DEV_CFG_DC: + ret = adf_comp_dev_config(accel_dev); + break; + } + + if (ret) + goto err; + + set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); + + return ret; + +err: + dev_err(&GET_DEV(accel_dev), "Failed to configure QAT driver\n"); return ret; } @@ -300,7 +415,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto out_err_disable_aer; - ret = adf_crypto_dev_config(accel_dev); + ret = hw_data->dev_config(accel_dev); if (ret) goto out_err_disable_aer; diff --git a/drivers/crypto/qat/qat_common/Makefile b/drivers/crypto/qat/qat_common/Makefile index e3db4786738f7..1fb8d50f509f8 100644 --- a/drivers/crypto/qat/qat_common/Makefile +++ b/drivers/crypto/qat/qat_common/Makefile @@ -16,6 +16,7 @@ intel_qat-objs := adf_cfg.o \ adf_gen4_hw_data.o \ adf_gen4_pm.o \ adf_gen2_dc.o \ + adf_gen4_dc.o \ qat_crypto.o \ qat_compression.o \ qat_comp_algs.o \ diff --git a/drivers/crypto/qat/qat_common/adf_gen4_dc.c b/drivers/crypto/qat/qat_common/adf_gen4_dc.c new file mode 100644 index 0000000000000..5859238e37de5 --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen4_dc.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation */ +#include "adf_accel_devices.h" +#include "icp_qat_fw_comp.h" +#include "icp_qat_hw_20_comp.h" +#include "adf_gen4_dc.h" + +static void qat_comp_build_deflate(void *ctx) +{ + struct icp_qat_fw_comp_req *req_tmpl = + (struct icp_qat_fw_comp_req *)ctx; + struct icp_qat_fw_comn_req_hdr *header = &req_tmpl->comn_hdr; + struct icp_qat_fw_comp_req_hdr_cd_pars *cd_pars = &req_tmpl->cd_pars; + struct icp_qat_fw_comp_req_params *req_pars = &req_tmpl->comp_pars; + struct icp_qat_hw_comp_20_config_csr_upper hw_comp_upper_csr = {0}; + struct icp_qat_hw_comp_20_config_csr_lower hw_comp_lower_csr = {0}; + struct icp_qat_hw_decomp_20_config_csr_lower hw_decomp_lower_csr = {0}; + u32 upper_val; + u32 lower_val; + + memset(req_tmpl, 0, sizeof(*req_tmpl)); + header->hdr_flags = + ICP_QAT_FW_COMN_HDR_FLAGS_BUILD(ICP_QAT_FW_COMN_REQ_FLAG_SET); + header->service_type = ICP_QAT_FW_COMN_REQ_CPM_FW_COMP; + header->service_cmd_id = ICP_QAT_FW_COMP_CMD_STATIC; + header->comn_req_flags = + ICP_QAT_FW_COMN_FLAGS_BUILD(QAT_COMN_CD_FLD_TYPE_16BYTE_DATA, + QAT_COMN_PTR_TYPE_SGL); + header->serv_specif_flags = + ICP_QAT_FW_COMP_FLAGS_BUILD(ICP_QAT_FW_COMP_STATELESS_SESSION, + ICP_QAT_FW_COMP_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_NOT_ENH_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_NOT_DISABLE_TYPE0_ENH_AUTO_SELECT_BEST, + ICP_QAT_FW_COMP_ENABLE_SECURE_RAM_USED_AS_INTMD_BUF); + hw_comp_lower_csr.skip_ctrl = ICP_QAT_HW_COMP_20_BYTE_SKIP_3BYTE_LITERAL; + hw_comp_lower_csr.algo = ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_ILZ77; + hw_comp_lower_csr.lllbd = ICP_QAT_HW_COMP_20_LLLBD_CTRL_LLLBD_ENABLED; + hw_comp_lower_csr.sd = ICP_QAT_HW_COMP_20_SEARCH_DEPTH_LEVEL_1; + hw_comp_lower_csr.hash_update = ICP_QAT_HW_COMP_20_SKIP_HASH_UPDATE_DONT_ALLOW; + hw_comp_lower_csr.edmm = ICP_QAT_HW_COMP_20_EXTENDED_DELAY_MATCH_MODE_EDMM_ENABLED; + hw_comp_upper_csr.nice = ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_DEFAULT_VAL; + hw_comp_upper_csr.lazy = ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_DEFAULT_VAL; + + upper_val = ICP_QAT_FW_COMP_20_BUILD_CONFIG_UPPER(hw_comp_upper_csr); + lower_val = ICP_QAT_FW_COMP_20_BUILD_CONFIG_LOWER(hw_comp_lower_csr); + + cd_pars->u.sl.comp_slice_cfg_word[0] = lower_val; + cd_pars->u.sl.comp_slice_cfg_word[1] = upper_val; + + req_pars->crc.legacy.initial_adler = COMP_CPR_INITIAL_ADLER; + req_pars->crc.legacy.initial_crc32 = COMP_CPR_INITIAL_CRC; + req_pars->req_par_flags = + ICP_QAT_FW_COMP_REQ_PARAM_FLAGS_BUILD(ICP_QAT_FW_COMP_SOP, + ICP_QAT_FW_COMP_EOP, + ICP_QAT_FW_COMP_BFINAL, + ICP_QAT_FW_COMP_CNV, + ICP_QAT_FW_COMP_CNV_RECOVERY, + ICP_QAT_FW_COMP_NO_CNV_DFX, + ICP_QAT_FW_COMP_CRC_MODE_LEGACY, + ICP_QAT_FW_COMP_NO_XXHASH_ACC, + ICP_QAT_FW_COMP_CNV_ERROR_NONE, + ICP_QAT_FW_COMP_NO_APPEND_CRC, + ICP_QAT_FW_COMP_NO_DROP_DATA); + + /* Fill second half of the template for decompression */ + memcpy(req_tmpl + 1, req_tmpl, sizeof(*req_tmpl)); + req_tmpl++; + header = &req_tmpl->comn_hdr; + header->service_cmd_id = ICP_QAT_FW_COMP_CMD_DECOMPRESS; + cd_pars = &req_tmpl->cd_pars; + + hw_decomp_lower_csr.algo = ICP_QAT_HW_DECOMP_20_HW_DECOMP_FORMAT_DEFLATE; + lower_val = ICP_QAT_FW_DECOMP_20_BUILD_CONFIG_LOWER(hw_decomp_lower_csr); + + cd_pars->u.sl.comp_slice_cfg_word[0] = lower_val; + cd_pars->u.sl.comp_slice_cfg_word[1] = 0; +} + +void adf_gen4_init_dc_ops(struct adf_dc_ops *dc_ops) +{ + dc_ops->build_deflate_ctx = qat_comp_build_deflate; +} +EXPORT_SYMBOL_GPL(adf_gen4_init_dc_ops); diff --git a/drivers/crypto/qat/qat_common/adf_gen4_dc.h b/drivers/crypto/qat/qat_common/adf_gen4_dc.h new file mode 100644 index 0000000000000..0b1a6774412eb --- /dev/null +++ b/drivers/crypto/qat/qat_common/adf_gen4_dc.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef ADF_GEN4_DC_H +#define ADF_GEN4_DC_H + +#include "adf_accel_devices.h" + +void adf_gen4_init_dc_ops(struct adf_dc_ops *dc_ops); + +#endif /* ADF_GEN4_DC_H */ diff --git a/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp.h b/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp.h new file mode 100644 index 0000000000000..7ea8962272f2f --- /dev/null +++ b/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef _ICP_QAT_HW_20_COMP_H_ +#define _ICP_QAT_HW_20_COMP_H_ + +#include "icp_qat_hw_20_comp_defs.h" +#include "icp_qat_fw.h" + +struct icp_qat_hw_comp_20_config_csr_lower { + enum icp_qat_hw_comp_20_extended_delay_match_mode edmm; + enum icp_qat_hw_comp_20_hw_comp_format algo; + enum icp_qat_hw_comp_20_search_depth sd; + enum icp_qat_hw_comp_20_hbs_control hbs; + enum icp_qat_hw_comp_20_abd abd; + enum icp_qat_hw_comp_20_lllbd_ctrl lllbd; + enum icp_qat_hw_comp_20_min_match_control mmctrl; + enum icp_qat_hw_comp_20_skip_hash_collision hash_col; + enum icp_qat_hw_comp_20_skip_hash_update hash_update; + enum icp_qat_hw_comp_20_byte_skip skip_ctrl; +}; + +static inline __u32 +ICP_QAT_FW_COMP_20_BUILD_CONFIG_LOWER(struct icp_qat_hw_comp_20_config_csr_lower csr) +{ + u32 val32 = 0; + + QAT_FIELD_SET(val32, csr.algo, + ICP_QAT_HW_COMP_20_CONFIG_CSR_HW_COMP_FORMAT_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_HW_COMP_FORMAT_MASK); + QAT_FIELD_SET(val32, csr.sd, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SEARCH_DEPTH_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SEARCH_DEPTH_MASK); + QAT_FIELD_SET(val32, csr.edmm, + ICP_QAT_HW_COMP_20_CONFIG_CSR_EXTENDED_DELAY_MATCH_MODE_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_EXTENDED_DELAY_MATCH_MODE_MASK); + QAT_FIELD_SET(val32, csr.hbs, + ICP_QAT_HW_COMP_20_CONFIG_CSR_HBS_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_HBS_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.lllbd, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LLLBD_CTRL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LLLBD_CTRL_MASK); + QAT_FIELD_SET(val32, csr.mmctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.hash_col, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_COLLISION_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_COLLISION_MASK); + QAT_FIELD_SET(val32, csr.hash_update, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_UPDATE_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_UPDATE_MASK); + QAT_FIELD_SET(val32, csr.skip_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_BYTE_SKIP_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_BYTE_SKIP_MASK); + QAT_FIELD_SET(val32, csr.abd, ICP_QAT_HW_COMP_20_CONFIG_CSR_ABD_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_ABD_MASK); + + return __builtin_bswap32(val32); +} + +struct icp_qat_hw_comp_20_config_csr_upper { + enum icp_qat_hw_comp_20_scb_control scb_ctrl; + enum icp_qat_hw_comp_20_rmb_control rmb_ctrl; + enum icp_qat_hw_comp_20_som_control som_ctrl; + enum icp_qat_hw_comp_20_skip_hash_rd_control skip_hash_ctrl; + enum icp_qat_hw_comp_20_scb_unload_control scb_unload_ctrl; + enum icp_qat_hw_comp_20_disable_token_fusion_control disable_token_fusion_ctrl; + enum icp_qat_hw_comp_20_lbms lbms; + enum icp_qat_hw_comp_20_scb_mode_reset_mask scb_mode_reset; + __u16 lazy; + __u16 nice; +}; + +static inline __u32 +ICP_QAT_FW_COMP_20_BUILD_CONFIG_UPPER(struct icp_qat_hw_comp_20_config_csr_upper csr) +{ + u32 val32 = 0; + + QAT_FIELD_SET(val32, csr.scb_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.rmb_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_RMB_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_RMB_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.som_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SOM_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SOM_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.skip_hash_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_RD_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_RD_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.scb_unload_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_UNLOAD_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_UNLOAD_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.disable_token_fusion_ctrl, + ICP_QAT_HW_COMP_20_CONFIG_CSR_DISABLE_TOKEN_FUSION_CONTROL_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_DISABLE_TOKEN_FUSION_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.lbms, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LBMS_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LBMS_MASK); + QAT_FIELD_SET(val32, csr.scb_mode_reset, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_MODE_RESET_MASK_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_MODE_RESET_MASK_MASK); + QAT_FIELD_SET(val32, csr.lazy, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_MASK); + QAT_FIELD_SET(val32, csr.nice, + ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_BITPOS, + ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_MASK); + + return __builtin_bswap32(val32); +} + +struct icp_qat_hw_decomp_20_config_csr_lower { + enum icp_qat_hw_decomp_20_hbs_control hbs; + enum icp_qat_hw_decomp_20_lbms lbms; + enum icp_qat_hw_decomp_20_hw_comp_format algo; + enum icp_qat_hw_decomp_20_min_match_control mmctrl; + enum icp_qat_hw_decomp_20_lz4_block_checksum_present lbc; +}; + +static inline __u32 +ICP_QAT_FW_DECOMP_20_BUILD_CONFIG_LOWER(struct icp_qat_hw_decomp_20_config_csr_lower csr) +{ + u32 val32 = 0; + + QAT_FIELD_SET(val32, csr.hbs, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HBS_CONTROL_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HBS_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.lbms, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LBMS_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LBMS_MASK); + QAT_FIELD_SET(val32, csr.algo, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HW_DECOMP_FORMAT_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HW_DECOMP_FORMAT_MASK); + QAT_FIELD_SET(val32, csr.mmctrl, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.lbc, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LZ4_BLOCK_CHECKSUM_PRESENT_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LZ4_BLOCK_CHECKSUM_PRESENT_MASK); + + return __builtin_bswap32(val32); +} + +struct icp_qat_hw_decomp_20_config_csr_upper { + enum icp_qat_hw_decomp_20_speculative_decoder_control sdc; + enum icp_qat_hw_decomp_20_mini_cam_control mcc; +}; + +static inline __u32 +ICP_QAT_FW_DECOMP_20_BUILD_CONFIG_UPPER(struct icp_qat_hw_decomp_20_config_csr_upper csr) +{ + u32 val32 = 0; + + QAT_FIELD_SET(val32, csr.sdc, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_SPECULATIVE_DECODER_CONTROL_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_SPECULATIVE_DECODER_CONTROL_MASK); + QAT_FIELD_SET(val32, csr.mcc, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MINI_CAM_CONTROL_BITPOS, + ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MINI_CAM_CONTROL_MASK); + + return __builtin_bswap32(val32); +} + +#endif diff --git a/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp_defs.h b/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp_defs.h new file mode 100644 index 0000000000000..208d4554283bf --- /dev/null +++ b/drivers/crypto/qat/qat_common/icp_qat_hw_20_comp_defs.h @@ -0,0 +1,300 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2022 Intel Corporation */ +#ifndef _ICP_QAT_HW_20_COMP_DEFS_H +#define _ICP_QAT_HW_20_COMP_DEFS_H + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_CONTROL_BITPOS 31 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_scb_control { + ICP_QAT_HW_COMP_20_SCB_CONTROL_ENABLE = 0x0, + ICP_QAT_HW_COMP_20_SCB_CONTROL_DISABLE = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SCB_CONTROL_DISABLE + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_RMB_CONTROL_BITPOS 30 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_RMB_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_rmb_control { + ICP_QAT_HW_COMP_20_RMB_CONTROL_RESET_ALL = 0x0, + ICP_QAT_HW_COMP_20_RMB_CONTROL_RESET_FC_ONLY = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_RMB_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_RMB_CONTROL_RESET_ALL + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SOM_CONTROL_BITPOS 28 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SOM_CONTROL_MASK 0x3 + +enum icp_qat_hw_comp_20_som_control { + ICP_QAT_HW_COMP_20_SOM_CONTROL_NORMAL_MODE = 0x0, + ICP_QAT_HW_COMP_20_SOM_CONTROL_REPLAY_MODE = 0x1, + ICP_QAT_HW_COMP_20_SOM_CONTROL_INPUT_CRC = 0x2, + ICP_QAT_HW_COMP_20_SOM_CONTROL_RESERVED_MODE = 0x3, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SOM_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SOM_CONTROL_NORMAL_MODE + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_RD_CONTROL_BITPOS 27 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_RD_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_skip_hash_rd_control { + ICP_QAT_HW_COMP_20_SKIP_HASH_RD_CONTROL_NO_SKIP = 0x0, + ICP_QAT_HW_COMP_20_SKIP_HASH_RD_CONTROL_SKIP_HASH_READS = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_RD_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SKIP_HASH_RD_CONTROL_NO_SKIP + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_UNLOAD_CONTROL_BITPOS 26 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_UNLOAD_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_scb_unload_control { + ICP_QAT_HW_COMP_20_SCB_UNLOAD_CONTROL_UNLOAD = 0x0, + ICP_QAT_HW_COMP_20_SCB_UNLOAD_CONTROL_NO_UNLOAD = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_UNLOAD_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SCB_UNLOAD_CONTROL_UNLOAD + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_DISABLE_TOKEN_FUSION_CONTROL_BITPOS 21 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_DISABLE_TOKEN_FUSION_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_disable_token_fusion_control { + ICP_QAT_HW_COMP_20_DISABLE_TOKEN_FUSION_CONTROL_ENABLE = 0x0, + ICP_QAT_HW_COMP_20_DISABLE_TOKEN_FUSION_CONTROL_DISABLE = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_DISABLE_TOKEN_FUSION_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_DISABLE_TOKEN_FUSION_CONTROL_ENABLE + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LBMS_BITPOS 19 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LBMS_MASK 0x3 + +enum icp_qat_hw_comp_20_lbms { + ICP_QAT_HW_COMP_20_LBMS_LBMS_64KB = 0x0, + ICP_QAT_HW_COMP_20_LBMS_LBMS_256KB = 0x1, + ICP_QAT_HW_COMP_20_LBMS_LBMS_1MB = 0x2, + ICP_QAT_HW_COMP_20_LBMS_LBMS_4MB = 0x3, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LBMS_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_LBMS_LBMS_64KB + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_MODE_RESET_MASK_BITPOS 18 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_MODE_RESET_MASK_MASK 0x1 + +enum icp_qat_hw_comp_20_scb_mode_reset_mask { + ICP_QAT_HW_COMP_20_SCB_MODE_RESET_MASK_RESET_COUNTERS = 0x0, + ICP_QAT_HW_COMP_20_SCB_MODE_RESET_MASK_RESET_COUNTERS_AND_HISTORY = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SCB_MODE_RESET_MASK_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SCB_MODE_RESET_MASK_RESET_COUNTERS + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_BITPOS 9 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_MASK 0x1ff +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LAZY_PARAM_DEFAULT_VAL 258 + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_BITPOS 0 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_MASK 0x1ff +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_NICE_PARAM_DEFAULT_VAL 259 + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HBS_CONTROL_BITPOS 14 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HBS_CONTROL_MASK 0x7 + +enum icp_qat_hw_comp_20_hbs_control { + ICP_QAT_HW_COMP_20_HBS_CONTROL_HBS_IS_32KB = 0x0, + ICP_QAT_HW_COMP_23_HBS_CONTROL_HBS_IS_64KB = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HBS_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_HBS_CONTROL_HBS_IS_32KB + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_ABD_BITPOS 13 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_ABD_MASK 0x1 + +enum icp_qat_hw_comp_20_abd { + ICP_QAT_HW_COMP_20_ABD_ABD_ENABLED = 0x0, + ICP_QAT_HW_COMP_20_ABD_ABD_DISABLED = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_ABD_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_ABD_ABD_ENABLED + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LLLBD_CTRL_BITPOS 12 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LLLBD_CTRL_MASK 0x1 + +enum icp_qat_hw_comp_20_lllbd_ctrl { + ICP_QAT_HW_COMP_20_LLLBD_CTRL_LLLBD_ENABLED = 0x0, + ICP_QAT_HW_COMP_20_LLLBD_CTRL_LLLBD_DISABLED = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_LLLBD_CTRL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_LLLBD_CTRL_LLLBD_ENABLED + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SEARCH_DEPTH_BITPOS 8 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SEARCH_DEPTH_MASK 0xf + +enum icp_qat_hw_comp_20_search_depth { + ICP_QAT_HW_COMP_20_SEARCH_DEPTH_LEVEL_1 = 0x1, + ICP_QAT_HW_COMP_20_SEARCH_DEPTH_LEVEL_6 = 0x3, + ICP_QAT_HW_COMP_20_SEARCH_DEPTH_LEVEL_9 = 0x4, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SEARCH_DEPTH_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SEARCH_DEPTH_LEVEL_1 + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HW_COMP_FORMAT_BITPOS 5 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HW_COMP_FORMAT_MASK 0x7 + +enum icp_qat_hw_comp_20_hw_comp_format { + ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_ILZ77 = 0x0, + ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_DEFLATE = 0x1, + ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_LZ4 = 0x2, + ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_LZ4S = 0x3, + ICP_QAT_HW_COMP_23_HW_COMP_FORMAT_ZSTD = 0x4, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_HW_COMP_FORMAT_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_HW_COMP_FORMAT_DEFLATE + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_BITPOS 4 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_MASK 0x1 + +enum icp_qat_hw_comp_20_min_match_control { + ICP_QAT_HW_COMP_20_MIN_MATCH_CONTROL_MATCH_3B = 0x0, + ICP_QAT_HW_COMP_20_MIN_MATCH_CONTROL_MATCH_4B = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_MIN_MATCH_CONTROL_MATCH_3B + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_COLLISION_BITPOS 3 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_COLLISION_MASK 0x1 + +enum icp_qat_hw_comp_20_skip_hash_collision { + ICP_QAT_HW_COMP_20_SKIP_HASH_COLLISION_ALLOW = 0x0, + ICP_QAT_HW_COMP_20_SKIP_HASH_COLLISION_DONT_ALLOW = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_COLLISION_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SKIP_HASH_COLLISION_ALLOW + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_UPDATE_BITPOS 2 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_UPDATE_MASK 0x1 + +enum icp_qat_hw_comp_20_skip_hash_update { + ICP_QAT_HW_COMP_20_SKIP_HASH_UPDATE_ALLOW = 0x0, + ICP_QAT_HW_COMP_20_SKIP_HASH_UPDATE_DONT_ALLOW = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_SKIP_HASH_UPDATE_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_SKIP_HASH_UPDATE_ALLOW + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_BYTE_SKIP_BITPOS 1 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_BYTE_SKIP_MASK 0x1 + +enum icp_qat_hw_comp_20_byte_skip { + ICP_QAT_HW_COMP_20_BYTE_SKIP_3BYTE_TOKEN = 0x0, + ICP_QAT_HW_COMP_20_BYTE_SKIP_3BYTE_LITERAL = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_BYTE_SKIP_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_BYTE_SKIP_3BYTE_TOKEN + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_EXTENDED_DELAY_MATCH_MODE_BITPOS 0 +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_EXTENDED_DELAY_MATCH_MODE_MASK 0x1 + +enum icp_qat_hw_comp_20_extended_delay_match_mode { + ICP_QAT_HW_COMP_20_EXTENDED_DELAY_MATCH_MODE_EDMM_DISABLED = 0x0, + ICP_QAT_HW_COMP_20_EXTENDED_DELAY_MATCH_MODE_EDMM_ENABLED = 0x1, +}; + +#define ICP_QAT_HW_COMP_20_CONFIG_CSR_EXTENDED_DELAY_MATCH_MODE_DEFAULT_VAL \ + ICP_QAT_HW_COMP_20_EXTENDED_DELAY_MATCH_MODE_EDMM_DISABLED + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_SPECULATIVE_DECODER_CONTROL_BITPOS 31 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_SPECULATIVE_DECODER_CONTROL_MASK 0x1 + +enum icp_qat_hw_decomp_20_speculative_decoder_control { + ICP_QAT_HW_DECOMP_20_SPECULATIVE_DECODER_CONTROL_ENABLE = 0x0, + ICP_QAT_HW_DECOMP_20_SPECULATIVE_DECODER_CONTROL_DISABLE = 0x1, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_SPECULATIVE_DECODER_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_SPECULATIVE_DECODER_CONTROL_ENABLE + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MINI_CAM_CONTROL_BITPOS 30 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MINI_CAM_CONTROL_MASK 0x1 + +enum icp_qat_hw_decomp_20_mini_cam_control { + ICP_QAT_HW_DECOMP_20_MINI_CAM_CONTROL_ENABLE = 0x0, + ICP_QAT_HW_DECOMP_20_MINI_CAM_CONTROL_DISABLE = 0x1, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MINI_CAM_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_MINI_CAM_CONTROL_ENABLE + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HBS_CONTROL_BITPOS 14 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HBS_CONTROL_MASK 0x7 + +enum icp_qat_hw_decomp_20_hbs_control { + ICP_QAT_HW_DECOMP_20_HBS_CONTROL_HBS_IS_32KB = 0x0, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HBS_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_HBS_CONTROL_HBS_IS_32KB + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LBMS_BITPOS 8 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LBMS_MASK 0x3 + +enum icp_qat_hw_decomp_20_lbms { + ICP_QAT_HW_DECOMP_20_LBMS_LBMS_64KB = 0x0, + ICP_QAT_HW_DECOMP_20_LBMS_LBMS_256KB = 0x1, + ICP_QAT_HW_DECOMP_20_LBMS_LBMS_1MB = 0x2, + ICP_QAT_HW_DECOMP_20_LBMS_LBMS_4MB = 0x3, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LBMS_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_LBMS_LBMS_64KB + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HW_DECOMP_FORMAT_BITPOS 5 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HW_DECOMP_FORMAT_MASK 0x7 + +enum icp_qat_hw_decomp_20_hw_comp_format { + ICP_QAT_HW_DECOMP_20_HW_DECOMP_FORMAT_DEFLATE = 0x1, + ICP_QAT_HW_DECOMP_20_HW_DECOMP_FORMAT_LZ4 = 0x2, + ICP_QAT_HW_DECOMP_20_HW_DECOMP_FORMAT_LZ4S = 0x3, + ICP_QAT_HW_DECOMP_23_HW_DECOMP_FORMAT_ZSTD = 0x4, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_HW_DECOMP_FORMAT_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_HW_DECOMP_FORMAT_DEFLATE + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_BITPOS 4 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_MASK 0x1 + +enum icp_qat_hw_decomp_20_min_match_control { + ICP_QAT_HW_DECOMP_20_MIN_MATCH_CONTROL_MATCH_3B = 0x0, + ICP_QAT_HW_DECOMP_20_MIN_MATCH_CONTROL_MATCH_4B = 0x1, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_MIN_MATCH_CONTROL_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_MIN_MATCH_CONTROL_MATCH_3B + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LZ4_BLOCK_CHECKSUM_PRESENT_BITPOS 3 +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LZ4_BLOCK_CHECKSUM_PRESENT_MASK 0x1 + +enum icp_qat_hw_decomp_20_lz4_block_checksum_present { + ICP_QAT_HW_DECOMP_20_LZ4_BLOCK_CHKSUM_ABSENT = 0x0, + ICP_QAT_HW_DECOMP_20_LZ4_BLOCK_CHKSUM_PRESENT = 0x1, +}; + +#define ICP_QAT_HW_DECOMP_20_CONFIG_CSR_LZ4_BLOCK_CHECKSUM_PRESENT_DEFAULT_VAL \ + ICP_QAT_HW_DECOMP_20_LZ4_BLOCK_CHKSUM_ABSENT + +#endif -- GitLab From 5fc8041e56782e4d44682f8c2e4d822817a4dae6 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:22 +0000 Subject: [PATCH 1258/1423] crypto: acomp - define max size for destination The acomp API allows to send requests with a NULL destination buffer. In this case, the algorithm implementation needs to allocate the destination scatter list, perform the operation and return the buffer to the user. For decompression, data is likely to expand and be bigger than the allocated buffer. Define the maximum size (128KB) that acomp implementations will allocate for decompression operations as destination buffer when they receive a request with a NULL destination buffer. Suggested-by: Herbert Xu Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- include/crypto/acompress.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index cb3d6b1c655de..e4bc96528902e 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -11,6 +11,7 @@ #include #define CRYPTO_ACOMP_ALLOC_OUTPUT 0x00000001 +#define CRYPTO_ACOMP_DST_MAX 131072 /** * struct acomp_req - asynchronous (de)compression request -- GitLab From 3112d0f1b0b32daac97d170dbc9d3cce69f7ff49 Mon Sep 17 00:00:00 2001 From: Giovanni Cabiddu Date: Mon, 28 Nov 2022 12:21:23 +0000 Subject: [PATCH 1259/1423] crypto: qat - add resubmit logic for decompression The acomp API allows to send requests with a NULL destination buffer. In this case, the algorithm implementation needs to allocate the destination scatter list, perform the operation and return the buffer to the user. For decompression, data is likely to expand and be bigger than the allocated buffer. This implements a re-submission mechanism for decompression requests that is triggered if the destination buffer, allocated by the driver, is not sufficiently big to store the output from decompression. If an overflow is detected when processing the callback for a decompression request with a NULL destination buffer, a workqueue is scheduled. This allocates a new scatter list of size CRYPTO_ACOMP_DST_MAX, now 128KB, creates a new firmware scatter list and resubmits the job to the hardware accelerator. Suggested-by: Herbert Xu Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_common/qat_bl.c | 159 ++++++++++++++++++ drivers/crypto/qat/qat_common/qat_bl.h | 6 + drivers/crypto/qat/qat_common/qat_comp_algs.c | 70 ++++++++ drivers/crypto/qat/qat_common/qat_comp_req.h | 10 ++ 4 files changed, 245 insertions(+) diff --git a/drivers/crypto/qat/qat_common/qat_bl.c b/drivers/crypto/qat/qat_common/qat_bl.c index 221a4eb610a38..2e89ff08041bf 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.c +++ b/drivers/crypto/qat/qat_common/qat_bl.c @@ -222,3 +222,162 @@ int qat_bl_sgl_to_bufl(struct adf_accel_dev *accel_dev, extra_dst_buff, sz_extra_dst_buff, flags); } + +static void qat_bl_sgl_unmap(struct adf_accel_dev *accel_dev, + struct qat_alg_buf_list *bl) +{ + struct device *dev = &GET_DEV(accel_dev); + int n = bl->num_bufs; + int i; + + for (i = 0; i < n; i++) + if (!dma_mapping_error(dev, bl->bufers[i].addr)) + dma_unmap_single(dev, bl->bufers[i].addr, + bl->bufers[i].len, DMA_FROM_DEVICE); +} + +static int qat_bl_sgl_map(struct adf_accel_dev *accel_dev, + struct scatterlist *sgl, + struct qat_alg_buf_list **bl) +{ + struct device *dev = &GET_DEV(accel_dev); + struct qat_alg_buf_list *bufl; + int node = dev_to_node(dev); + struct scatterlist *sg; + int n, i, sg_nctr; + size_t sz; + + n = sg_nents(sgl); + sz = struct_size(bufl, bufers, n); + bufl = kzalloc_node(sz, GFP_KERNEL, node); + if (unlikely(!bufl)) + return -ENOMEM; + + for (i = 0; i < n; i++) + bufl->bufers[i].addr = DMA_MAPPING_ERROR; + + sg_nctr = 0; + for_each_sg(sgl, sg, n, i) { + int y = sg_nctr; + + if (!sg->length) + continue; + + bufl->bufers[y].addr = dma_map_single(dev, sg_virt(sg), + sg->length, + DMA_FROM_DEVICE); + bufl->bufers[y].len = sg->length; + if (unlikely(dma_mapping_error(dev, bufl->bufers[y].addr))) + goto err_map; + sg_nctr++; + } + bufl->num_bufs = sg_nctr; + bufl->num_mapped_bufs = sg_nctr; + + *bl = bufl; + + return 0; + +err_map: + for (i = 0; i < n; i++) + if (!dma_mapping_error(dev, bufl->bufers[i].addr)) + dma_unmap_single(dev, bufl->bufers[i].addr, + bufl->bufers[i].len, + DMA_FROM_DEVICE); + kfree(bufl); + *bl = NULL; + + return -ENOMEM; +} + +static void qat_bl_sgl_free_unmap(struct adf_accel_dev *accel_dev, + struct scatterlist *sgl, + struct qat_alg_buf_list *bl, + bool free_bl) +{ + if (bl) { + qat_bl_sgl_unmap(accel_dev, bl); + + if (free_bl) + kfree(bl); + } + if (sgl) + sgl_free(sgl); +} + +static int qat_bl_sgl_alloc_map(struct adf_accel_dev *accel_dev, + struct scatterlist **sgl, + struct qat_alg_buf_list **bl, + unsigned int dlen, + gfp_t gfp) +{ + struct scatterlist *dst; + int ret; + + dst = sgl_alloc(dlen, gfp, NULL); + if (!dst) { + dev_err(&GET_DEV(accel_dev), "sg_alloc failed\n"); + return -ENOMEM; + } + + ret = qat_bl_sgl_map(accel_dev, dst, bl); + if (ret) + goto err; + + *sgl = dst; + + return 0; + +err: + sgl_free(dst); + *sgl = NULL; + return ret; +} + +int qat_bl_realloc_map_new_dst(struct adf_accel_dev *accel_dev, + struct scatterlist **sg, + unsigned int dlen, + struct qat_request_buffs *qat_bufs, + gfp_t gfp) +{ + struct device *dev = &GET_DEV(accel_dev); + dma_addr_t new_blp = DMA_MAPPING_ERROR; + struct qat_alg_buf_list *new_bl; + struct scatterlist *new_sg; + size_t new_bl_size; + int ret; + + ret = qat_bl_sgl_alloc_map(accel_dev, &new_sg, &new_bl, dlen, gfp); + if (ret) + return ret; + + new_bl_size = struct_size(new_bl, bufers, new_bl->num_bufs); + + /* Map new firmware SGL descriptor */ + new_blp = dma_map_single(dev, new_bl, new_bl_size, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev, new_blp))) + goto err; + + /* Unmap old firmware SGL descriptor */ + dma_unmap_single(dev, qat_bufs->bloutp, qat_bufs->sz_out, DMA_TO_DEVICE); + + /* Free and unmap old scatterlist */ + qat_bl_sgl_free_unmap(accel_dev, *sg, qat_bufs->blout, + !qat_bufs->sgl_dst_valid); + + qat_bufs->sgl_dst_valid = false; + qat_bufs->blout = new_bl; + qat_bufs->bloutp = new_blp; + qat_bufs->sz_out = new_bl_size; + + *sg = new_sg; + + return 0; +err: + qat_bl_sgl_free_unmap(accel_dev, new_sg, new_bl, true); + + if (!dma_mapping_error(dev, new_blp)) + dma_unmap_single(dev, new_blp, new_bl_size, DMA_TO_DEVICE); + + return -ENOMEM; +} diff --git a/drivers/crypto/qat/qat_common/qat_bl.h b/drivers/crypto/qat/qat_common/qat_bl.h index 5f2ea8f352f76..8ca5e52ee9e21 100644 --- a/drivers/crypto/qat/qat_common/qat_bl.h +++ b/drivers/crypto/qat/qat_common/qat_bl.h @@ -58,4 +58,10 @@ static inline gfp_t qat_algs_alloc_flags(struct crypto_async_request *req) return req->flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; } +int qat_bl_realloc_map_new_dst(struct adf_accel_dev *accel_dev, + struct scatterlist **newd, + unsigned int dlen, + struct qat_request_buffs *qat_bufs, + gfp_t gfp); + #endif diff --git a/drivers/crypto/qat/qat_common/qat_comp_algs.c b/drivers/crypto/qat/qat_common/qat_comp_algs.c index 63fd4ac33dbf3..1480d36a8d2bb 100644 --- a/drivers/crypto/qat/qat_common/qat_comp_algs.c +++ b/drivers/crypto/qat/qat_common/qat_comp_algs.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "adf_accel_devices.h" #include "adf_common_drv.h" #include "qat_bl.h" @@ -25,6 +26,11 @@ struct qat_compression_ctx { struct qat_compression_instance *inst; }; +struct qat_dst { + bool is_null; + int resubmitted; +}; + struct qat_compression_req { u8 req[QAT_COMP_REQ_SIZE]; struct qat_compression_ctx *qat_compression_ctx; @@ -33,6 +39,8 @@ struct qat_compression_req { enum direction dir; int actual_dlen; struct qat_alg_req alg_req; + struct work_struct resubmit; + struct qat_dst dst; }; static int qat_alg_send_dc_message(struct qat_compression_req *qat_req, @@ -49,6 +57,46 @@ static int qat_alg_send_dc_message(struct qat_compression_req *qat_req, return qat_alg_send_message(alg_req); } +static void qat_comp_resubmit(struct work_struct *work) +{ + struct qat_compression_req *qat_req = + container_of(work, struct qat_compression_req, resubmit); + struct qat_compression_ctx *ctx = qat_req->qat_compression_ctx; + struct adf_accel_dev *accel_dev = ctx->inst->accel_dev; + struct qat_request_buffs *qat_bufs = &qat_req->buf; + struct qat_compression_instance *inst = ctx->inst; + struct acomp_req *areq = qat_req->acompress_req; + struct crypto_acomp *tfm = crypto_acomp_reqtfm(areq); + unsigned int dlen = CRYPTO_ACOMP_DST_MAX; + u8 *req = qat_req->req; + dma_addr_t dfbuf; + int ret; + + areq->dlen = dlen; + + dev_dbg(&GET_DEV(accel_dev), "[%s][%s] retry NULL dst request - dlen = %d\n", + crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)), + qat_req->dir == COMPRESSION ? "comp" : "decomp", dlen); + + ret = qat_bl_realloc_map_new_dst(accel_dev, &areq->dst, dlen, qat_bufs, + qat_algs_alloc_flags(&areq->base)); + if (ret) + goto err; + + qat_req->dst.resubmitted = true; + + dfbuf = qat_req->buf.bloutp; + qat_comp_override_dst(req, dfbuf, dlen); + + ret = qat_alg_send_dc_message(qat_req, inst, &areq->base); + if (ret != -ENOSPC) + return; + +err: + qat_bl_free_bufl(accel_dev, qat_bufs); + areq->base.complete(&areq->base, ret); +} + static void qat_comp_generic_callback(struct qat_compression_req *qat_req, void *resp) { @@ -80,6 +128,21 @@ static void qat_comp_generic_callback(struct qat_compression_req *qat_req, areq->dlen = 0; + if (qat_req->dir == DECOMPRESSION && qat_req->dst.is_null) { + if (cmp_err == ERR_CODE_OVERFLOW_ERROR) { + if (qat_req->dst.resubmitted) { + dev_dbg(&GET_DEV(accel_dev), + "Output does not fit destination buffer\n"); + res = -EOVERFLOW; + goto end; + } + + INIT_WORK(&qat_req->resubmit, qat_comp_resubmit); + adf_misc_wq_queue_work(&qat_req->resubmit); + return; + } + } + if (unlikely(status != ICP_QAT_FW_COMN_STATUS_FLAG_OK)) goto end; @@ -176,16 +239,23 @@ static int qat_comp_alg_compress_decompress(struct acomp_req *areq, if (areq->dst && !dlen) return -EINVAL; + qat_req->dst.is_null = false; + /* Handle acomp requests that require the allocation of a destination * buffer. The size of the destination buffer is double the source * buffer (rounded up to the size of a page) to fit the decompressed * output or an expansion on the data for compression. */ if (!areq->dst) { + qat_req->dst.is_null = true; + dlen = round_up(2 * slen, PAGE_SIZE); areq->dst = sgl_alloc(dlen, f, NULL); if (!areq->dst) return -ENOMEM; + + areq->dlen = dlen; + qat_req->dst.resubmitted = false; } if (dir == COMPRESSION) { diff --git a/drivers/crypto/qat/qat_common/qat_comp_req.h b/drivers/crypto/qat/qat_common/qat_comp_req.h index 18a1f33a6db98..404e32c5e7783 100644 --- a/drivers/crypto/qat/qat_common/qat_comp_req.h +++ b/drivers/crypto/qat/qat_common/qat_comp_req.h @@ -25,6 +25,16 @@ static inline void qat_comp_create_req(void *ctx, void *req, u64 src, u32 slen, req_pars->out_buffer_sz = dlen; } +static inline void qat_comp_override_dst(void *req, u64 dst, u32 dlen) +{ + struct icp_qat_fw_comp_req *fw_req = req; + struct icp_qat_fw_comp_req_params *req_pars = &fw_req->comp_pars; + + fw_req->comn_mid.dest_data_addr = dst; + fw_req->comn_mid.dst_length = dlen; + req_pars->out_buffer_sz = dlen; +} + static inline void qat_comp_create_compression_req(void *ctx, void *req, u64 src, u32 slen, u64 dst, u32 dlen, -- GitLab From 3564f5a2144355cadb4f0c5c14d2bc7fcd2418b9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 29 Nov 2022 17:52:35 +0800 Subject: [PATCH 1260/1423] crypto: chelsio - Fix flexible struct array warning This patch fixes the sparse warning about arrays of flexible structures by removing an unnecessary use of them in struct __crypto_ctx. Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 6 +++--- drivers/crypto/chelsio/chcr_crypto.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 6933546f87b1a..9fac1e7584066 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -98,17 +98,17 @@ static int chcr_handle_cipher_resp(struct skcipher_request *req, static inline struct chcr_aead_ctx *AEAD_CTX(struct chcr_context *ctx) { - return ctx->crypto_ctx->aeadctx; + return &ctx->crypto_ctx->aeadctx; } static inline struct ablk_ctx *ABLK_CTX(struct chcr_context *ctx) { - return ctx->crypto_ctx->ablkctx; + return &ctx->crypto_ctx->ablkctx; } static inline struct hmac_ctx *HMAC_CTX(struct chcr_context *ctx) { - return ctx->crypto_ctx->hmacctx; + return &ctx->crypto_ctx->hmacctx; } static inline struct chcr_gcm_ctx *GCM_CTX(struct chcr_aead_ctx *gctx) diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h index c7816c83e3246..7f88ddb086313 100644 --- a/drivers/crypto/chelsio/chcr_crypto.h +++ b/drivers/crypto/chelsio/chcr_crypto.h @@ -248,9 +248,9 @@ struct hmac_ctx { struct __crypto_ctx { union { - DECLARE_FLEX_ARRAY(struct hmac_ctx, hmacctx); - DECLARE_FLEX_ARRAY(struct ablk_ctx, ablkctx); - DECLARE_FLEX_ARRAY(struct chcr_aead_ctx, aeadctx); + struct hmac_ctx hmacctx; + struct ablk_ctx ablkctx; + struct chcr_aead_ctx aeadctx; }; }; -- GitLab From 67ab02dce3adad3ea399e824b37f8e1c2453449f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Nov 2022 17:48:49 +0100 Subject: [PATCH 1261/1423] crypto: arm64/aes-neonbs - use frame_push/pop consistently Use the frame_push and frame_pop macros consistently to create the stack frame, so that we will get PAC and/or shadow call stack handling as well when enabled. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-neonbs-core.S | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index d427f4556b6eb..7278a37c2d5cd 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -760,7 +760,7 @@ SYM_FUNC_START_LOCAL(__xts_crypt8) eor v6.16b, v6.16b, v31.16b eor v7.16b, v7.16b, v16.16b - stp q16, q17, [sp, #16] + stp q16, q17, [x6] mov bskey, x2 mov rounds, x3 @@ -768,8 +768,8 @@ SYM_FUNC_START_LOCAL(__xts_crypt8) SYM_FUNC_END(__xts_crypt8) .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 - stp x29, x30, [sp, #-48]! - mov x29, sp + frame_push 0, 32 + add x6, sp, #.Lframe_local_offset ld1 {v25.16b}, [x5] @@ -781,7 +781,7 @@ SYM_FUNC_END(__xts_crypt8) eor v18.16b, \o2\().16b, v27.16b eor v19.16b, \o3\().16b, v28.16b - ldp q24, q25, [sp, #16] + ldp q24, q25, [x6] eor v20.16b, \o4\().16b, v29.16b eor v21.16b, \o5\().16b, v30.16b @@ -795,7 +795,7 @@ SYM_FUNC_END(__xts_crypt8) b.gt 0b st1 {v25.16b}, [x5] - ldp x29, x30, [sp], #48 + frame_pop ret .endm @@ -820,9 +820,7 @@ SYM_FUNC_END(aesbs_xts_decrypt) * int rounds, int blocks, u8 iv[]) */ SYM_FUNC_START(aesbs_ctr_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp - + frame_push 0 ldp x7, x8, [x5] ld1 {v0.16b}, [x5] CPU_LE( rev x7, x7 ) @@ -862,6 +860,6 @@ CPU_LE( rev x8, x8 ) b.gt 0b st1 {v0.16b}, [x5] - ldp x29, x30, [sp], #16 + frame_pop ret SYM_FUNC_END(aesbs_ctr_encrypt) -- GitLab From 7d709af18054bc9e2043499bb35eb1809c2a316f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Nov 2022 17:48:50 +0100 Subject: [PATCH 1262/1423] crypto: arm64/aes-modes - use frame_push/pop macros consistently Use the frame_push and frame_pop macros to create the stack frames in the AES chaining mode wrappers so that they will get PAC and/or shadow call stack protection when configured. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-modes.S | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 5abc834271f4a..0e834a2c062cf 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -52,8 +52,7 @@ SYM_FUNC_END(aes_decrypt_block5x) */ AES_FUNC_START(aes_ecb_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 0 enc_prepare w3, x2, x5 @@ -77,14 +76,13 @@ ST5( st1 {v4.16b}, [x0], #16 ) subs w4, w4, #1 bne .Lecbencloop .Lecbencout: - ldp x29, x30, [sp], #16 + frame_pop ret AES_FUNC_END(aes_ecb_encrypt) AES_FUNC_START(aes_ecb_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 0 dec_prepare w3, x2, x5 @@ -108,7 +106,7 @@ ST5( st1 {v4.16b}, [x0], #16 ) subs w4, w4, #1 bne .Lecbdecloop .Lecbdecout: - ldp x29, x30, [sp], #16 + frame_pop ret AES_FUNC_END(aes_ecb_decrypt) @@ -171,9 +169,6 @@ AES_FUNC_END(aes_cbc_encrypt) AES_FUNC_END(aes_essiv_cbc_encrypt) AES_FUNC_START(aes_essiv_cbc_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp - ld1 {cbciv.16b}, [x5] /* get iv */ mov w8, #14 /* AES-256: 14 rounds */ @@ -182,11 +177,9 @@ AES_FUNC_START(aes_essiv_cbc_decrypt) b .Lessivcbcdecstart AES_FUNC_START(aes_cbc_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp - ld1 {cbciv.16b}, [x5] /* get iv */ .Lessivcbcdecstart: + frame_push 0 dec_prepare w3, x2, x6 .LcbcdecloopNx: @@ -236,7 +229,7 @@ ST5( st1 {v4.16b}, [x0], #16 ) bne .Lcbcdecloop .Lcbcdecout: st1 {cbciv.16b}, [x5] /* return iv */ - ldp x29, x30, [sp], #16 + frame_pop ret AES_FUNC_END(aes_cbc_decrypt) AES_FUNC_END(aes_essiv_cbc_decrypt) @@ -337,8 +330,7 @@ AES_FUNC_END(aes_cbc_cts_decrypt) BLOCKS .req x13 BLOCKS_W .req w13 - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 0 enc_prepare ROUNDS_W, KEY, IV_PART ld1 {vctr.16b}, [IV] @@ -481,7 +473,7 @@ ST5( st1 {v4.16b}, [OUT], #16 ) .if !\xctr st1 {vctr.16b}, [IV] /* return next CTR value */ .endif - ldp x29, x30, [sp], #16 + frame_pop ret .Lctrtail\xctr: @@ -645,8 +637,7 @@ AES_FUNC_END(aes_xctr_encrypt) .endm AES_FUNC_START(aes_xts_encrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 0 ld1 {v4.16b}, [x6] xts_load_mask v8 @@ -704,7 +695,7 @@ AES_FUNC_START(aes_xts_encrypt) st1 {v0.16b}, [x0] .Lxtsencret: st1 {v4.16b}, [x6] - ldp x29, x30, [sp], #16 + frame_pop ret .LxtsencctsNx: @@ -732,8 +723,7 @@ AES_FUNC_START(aes_xts_encrypt) AES_FUNC_END(aes_xts_encrypt) AES_FUNC_START(aes_xts_decrypt) - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 0 /* subtract 16 bytes if we are doing CTS */ sub w8, w4, #0x10 @@ -794,7 +784,7 @@ AES_FUNC_START(aes_xts_decrypt) b .Lxtsdecloop .Lxtsdecout: st1 {v4.16b}, [x6] - ldp x29, x30, [sp], #16 + frame_pop ret .Lxtsdeccts: -- GitLab From 489a4a05fe6d544f1f1052d2c6cd5bffbd89ddb6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Nov 2022 17:48:51 +0100 Subject: [PATCH 1263/1423] crypto: arm64/crct10dif - use frame_push/pop macros consistently Use the frame_push and frame_pop macros to set up the stack frame so that return address protections will be enabled automically when configured. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-core.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index dce6dcebfca18..5604de61d06d0 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -429,7 +429,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) umov w0, v0.h[0] .ifc \p, p8 - ldp x29, x30, [sp], #16 + frame_pop .endif ret @@ -466,8 +466,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Assumes len >= 16. // SYM_FUNC_START(crc_t10dif_pmull_p8) - stp x29, x30, [sp, #-16]! - mov x29, sp + frame_push 1 crc_t10dif_pmull p8 SYM_FUNC_END(crc_t10dif_pmull_p8) -- GitLab From a428636d4c827ebe967aa31a83684b4c8e742ed1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Nov 2022 17:48:52 +0100 Subject: [PATCH 1264/1423] crypto: arm64/ghash-ce - use frame_push/pop macros consistently Use the frame_push and frame_pop macros to set up the stack frame so that return address protections will be enabled automically when configured. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index ebe5558929b7b..23ee9a5eaf27c 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -436,9 +436,7 @@ SYM_FUNC_END(pmull_ghash_update_p8) .align 6 .macro pmull_gcm_do_crypt, enc - stp x29, x30, [sp, #-32]! - mov x29, sp - str x19, [sp, #24] + frame_push 1 load_round_keys x7, x6, x8 @@ -529,7 +527,7 @@ CPU_LE( rev w8, w8 ) .endif bne 0b -3: ldp x19, x10, [sp, #24] +3: ldr x10, [sp, #.Lframe_local_offset] cbz x10, 5f // output tag? ld1 {INP3.16b}, [x10] // load lengths[] @@ -562,7 +560,7 @@ CPU_LE( rev w8, w8 ) smov w0, v0.b[0] // return b0 .endif -4: ldp x29, x30, [sp], #32 +4: frame_pop ret 5: -- GitLab From 04ba54e5af8f8f0137b08cb51a0b3a2e1ea46c94 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 1 Dec 2022 14:25:26 +0800 Subject: [PATCH 1265/1423] crypto: img-hash - Fix variable dereferenced before check 'hdev->req' Smatch report warning as follows: drivers/crypto/img-hash.c:366 img_hash_dma_task() warn: variable dereferenced before check 'hdev->req' Variable dereferenced should be done after check 'hdev->req', fix it. Fixes: d358f1abbf71 ("crypto: img-hash - Add Imagination Technologies hw hash accelerator") Fixes: 10badea259fa ("crypto: img-hash - Fix null pointer exception") Signed-off-by: Gaosheng Cui Signed-off-by: Herbert Xu --- drivers/crypto/img-hash.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index d8e82d69745d8..9629e98bd68b7 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -358,12 +358,16 @@ static int img_hash_dma_init(struct img_hash_dev *hdev) static void img_hash_dma_task(unsigned long d) { struct img_hash_dev *hdev = (struct img_hash_dev *)d; - struct img_hash_request_ctx *ctx = ahash_request_ctx(hdev->req); + struct img_hash_request_ctx *ctx; u8 *addr; size_t nbytes, bleft, wsend, len, tbc; struct scatterlist tsg; - if (!hdev->req || !ctx->sg) + if (!hdev->req) + return; + + ctx = ahash_request_ctx(hdev->req); + if (!ctx->sg) return; addr = sg_virt(ctx->sg); -- GitLab From 1c64a7e1f931821acadf964c5ddb0dc41abf9e20 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:47 +0800 Subject: [PATCH 1266/1423] crypto: cavium - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/cavium/cpt/cptvf_algs.c | 10 +++++----- drivers/crypto/cavium/nitrox/nitrox_aead.c | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.c b/drivers/crypto/cavium/cpt/cptvf_algs.c index ce3b91c612f0f..9eca0c3021866 100644 --- a/drivers/crypto/cavium/cpt/cptvf_algs.c +++ b/drivers/crypto/cavium/cpt/cptvf_algs.c @@ -97,7 +97,7 @@ static inline u32 create_ctx_hdr(struct skcipher_request *req, u32 enc, { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cvm_enc_ctx *ctx = crypto_skcipher_ctx(tfm); - struct cvm_req_ctx *rctx = skcipher_request_ctx(req); + struct cvm_req_ctx *rctx = skcipher_request_ctx_dma(req); struct fc_context *fctx = &rctx->fctx; u32 enc_iv_len = crypto_skcipher_ivsize(tfm); struct cpt_request_info *req_info = &rctx->cpt_req; @@ -151,7 +151,7 @@ static inline u32 create_ctx_hdr(struct skcipher_request *req, u32 enc, static inline u32 create_input_list(struct skcipher_request *req, u32 enc, u32 enc_iv_len) { - struct cvm_req_ctx *rctx = skcipher_request_ctx(req); + struct cvm_req_ctx *rctx = skcipher_request_ctx_dma(req); struct cpt_request_info *req_info = &rctx->cpt_req; u32 argcnt = 0; @@ -173,7 +173,7 @@ static inline void store_cb_info(struct skcipher_request *req, static inline void create_output_list(struct skcipher_request *req, u32 enc_iv_len) { - struct cvm_req_ctx *rctx = skcipher_request_ctx(req); + struct cvm_req_ctx *rctx = skcipher_request_ctx_dma(req); struct cpt_request_info *req_info = &rctx->cpt_req; u32 argcnt = 0; @@ -193,7 +193,7 @@ static inline void create_output_list(struct skcipher_request *req, static inline int cvm_enc_dec(struct skcipher_request *req, u32 enc) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cvm_req_ctx *rctx = skcipher_request_ctx(req); + struct cvm_req_ctx *rctx = skcipher_request_ctx_dma(req); u32 enc_iv_len = crypto_skcipher_ivsize(tfm); struct fc_context *fctx = &rctx->fctx; struct cpt_request_info *req_info = &rctx->cpt_req; @@ -335,7 +335,7 @@ static int cvm_ecb_des3_setkey(struct crypto_skcipher *cipher, const u8 *key, static int cvm_enc_dec_init(struct crypto_skcipher *tfm) { - crypto_skcipher_set_reqsize(tfm, sizeof(struct cvm_req_ctx)); + crypto_skcipher_set_reqsize_dma(tfm, sizeof(struct cvm_req_ctx)); return 0; } diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c index c93c4e41d2678..0653484df23f7 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_aead.c +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -392,7 +392,7 @@ static int nitrox_rfc4106_setauthsize(struct crypto_aead *aead, static int nitrox_rfc4106_set_aead_rctx_sglist(struct aead_request *areq) { - struct nitrox_rfc4106_rctx *rctx = aead_request_ctx(areq); + struct nitrox_rfc4106_rctx *rctx = aead_request_ctx_dma(areq); struct nitrox_aead_rctx *aead_rctx = &rctx->base; unsigned int assoclen = areq->assoclen - GCM_RFC4106_IV_SIZE; struct scatterlist *sg; @@ -424,7 +424,7 @@ static int nitrox_rfc4106_set_aead_rctx_sglist(struct aead_request *areq) static void nitrox_rfc4106_callback(void *arg, int err) { struct aead_request *areq = arg; - struct nitrox_rfc4106_rctx *rctx = aead_request_ctx(areq); + struct nitrox_rfc4106_rctx *rctx = aead_request_ctx_dma(areq); struct nitrox_kcrypt_request *nkreq = &rctx->base.nkreq; free_src_sglist(nkreq); @@ -441,7 +441,7 @@ static int nitrox_rfc4106_enc(struct aead_request *areq) { struct crypto_aead *aead = crypto_aead_reqtfm(areq); struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); - struct nitrox_rfc4106_rctx *rctx = aead_request_ctx(areq); + struct nitrox_rfc4106_rctx *rctx = aead_request_ctx_dma(areq); struct nitrox_aead_rctx *aead_rctx = &rctx->base; struct se_crypto_request *creq = &aead_rctx->nkreq.creq; int ret; @@ -472,7 +472,7 @@ static int nitrox_rfc4106_enc(struct aead_request *areq) static int nitrox_rfc4106_dec(struct aead_request *areq) { struct crypto_aead *aead = crypto_aead_reqtfm(areq); - struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx_dma(aead); struct nitrox_rfc4106_rctx *rctx = aead_request_ctx(areq); struct nitrox_aead_rctx *aead_rctx = &rctx->base; struct se_crypto_request *creq = &aead_rctx->nkreq.creq; @@ -510,8 +510,8 @@ static int nitrox_rfc4106_init(struct crypto_aead *aead) if (ret) return ret; - crypto_aead_set_reqsize(aead, sizeof(struct aead_request) + - sizeof(struct nitrox_rfc4106_rctx)); + crypto_aead_set_reqsize_dma(aead, sizeof(struct aead_request) + + sizeof(struct nitrox_rfc4106_rctx)); return 0; } -- GitLab From 99c6b20edfc031610240afca97ba9be5ec6f5750 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:49 +0800 Subject: [PATCH 1267/1423] crypto: ccp - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-aes-cmac.c | 21 ++++++++-------- drivers/crypto/ccp/ccp-crypto-aes-galois.c | 12 ++++----- drivers/crypto/ccp/ccp-crypto-aes-xts.c | 20 ++++++++------- drivers/crypto/ccp/ccp-crypto-aes.c | 29 +++++++++++----------- drivers/crypto/ccp/ccp-crypto-des3.c | 17 +++++++------ drivers/crypto/ccp/ccp-crypto-main.c | 4 +-- drivers/crypto/ccp/ccp-crypto-rsa.c | 18 +++++++------- drivers/crypto/ccp/ccp-crypto-sha.c | 26 +++++++++---------- 8 files changed, 76 insertions(+), 71 deletions(-) diff --git a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c index 11a305fa19e67..d8426bdf31908 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c @@ -25,7 +25,7 @@ static int ccp_aes_cmac_complete(struct crypto_async_request *async_req, { struct ahash_request *req = ahash_request_cast(async_req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx_dma(req); unsigned int digest_size = crypto_ahash_digestsize(tfm); if (ret) @@ -56,8 +56,8 @@ static int ccp_do_cmac_update(struct ahash_request *req, unsigned int nbytes, unsigned int final) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ccp_ctx *ctx = crypto_ahash_ctx(tfm); - struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(tfm); + struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx_dma(req); struct scatterlist *sg, *cmac_key_sg = NULL; unsigned int block_size = crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); @@ -182,7 +182,7 @@ e_free: static int ccp_aes_cmac_init(struct ahash_request *req) { - struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx_dma(req); memset(rctx, 0, sizeof(*rctx)); @@ -219,7 +219,7 @@ static int ccp_aes_cmac_digest(struct ahash_request *req) static int ccp_aes_cmac_export(struct ahash_request *req, void *out) { - struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx_dma(req); struct ccp_aes_cmac_exp_ctx state; /* Don't let anything leak to 'out' */ @@ -238,7 +238,7 @@ static int ccp_aes_cmac_export(struct ahash_request *req, void *out) static int ccp_aes_cmac_import(struct ahash_request *req, const void *in) { - struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_aes_cmac_req_ctx *rctx = ahash_request_ctx_dma(req); struct ccp_aes_cmac_exp_ctx state; /* 'in' may not be aligned so memcpy to local variable */ @@ -256,7 +256,7 @@ static int ccp_aes_cmac_import(struct ahash_request *req, const void *in) static int ccp_aes_cmac_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int key_len) { - struct ccp_ctx *ctx = crypto_tfm_ctx(crypto_ahash_tfm(tfm)); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct ccp_crypto_ahash_alg *alg = ccp_crypto_ahash_alg(crypto_ahash_tfm(tfm)); u64 k0_hi, k0_lo, k1_hi, k1_lo, k2_hi, k2_lo; @@ -334,13 +334,14 @@ static int ccp_aes_cmac_setkey(struct crypto_ahash *tfm, const u8 *key, static int ccp_aes_cmac_cra_init(struct crypto_tfm *tfm) { - struct ccp_ctx *ctx = crypto_tfm_ctx(tfm); + struct ccp_ctx *ctx = crypto_tfm_ctx_dma(tfm); struct crypto_ahash *ahash = __crypto_ahash_cast(tfm); ctx->complete = ccp_aes_cmac_complete; ctx->u.aes.key_len = 0; - crypto_ahash_set_reqsize(ahash, sizeof(struct ccp_aes_cmac_req_ctx)); + crypto_ahash_set_reqsize_dma(ahash, + sizeof(struct ccp_aes_cmac_req_ctx)); return 0; } @@ -382,7 +383,7 @@ int ccp_register_aes_cmac_algs(struct list_head *head) CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK; base->cra_blocksize = AES_BLOCK_SIZE; - base->cra_ctxsize = sizeof(struct ccp_ctx); + base->cra_ctxsize = sizeof(struct ccp_ctx) + crypto_dma_padding(); base->cra_priority = CCP_CRA_PRIORITY; base->cra_init = ccp_aes_cmac_cra_init; base->cra_module = THIS_MODULE; diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c index 1c1c939f5c39b..b1dbb8cea559e 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-galois.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c @@ -29,7 +29,7 @@ static int ccp_aes_gcm_complete(struct crypto_async_request *async_req, int ret) static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int key_len) { - struct ccp_ctx *ctx = crypto_aead_ctx(tfm); + struct ccp_ctx *ctx = crypto_aead_ctx_dma(tfm); switch (key_len) { case AES_KEYSIZE_128: @@ -76,8 +76,8 @@ static int ccp_aes_gcm_setauthsize(struct crypto_aead *tfm, static int ccp_aes_gcm_crypt(struct aead_request *req, bool encrypt) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct ccp_ctx *ctx = crypto_aead_ctx(tfm); - struct ccp_aes_req_ctx *rctx = aead_request_ctx(req); + struct ccp_ctx *ctx = crypto_aead_ctx_dma(tfm); + struct ccp_aes_req_ctx *rctx = aead_request_ctx_dma(req); struct scatterlist *iv_sg = NULL; unsigned int iv_len = 0; int i; @@ -148,12 +148,12 @@ static int ccp_aes_gcm_decrypt(struct aead_request *req) static int ccp_aes_gcm_cra_init(struct crypto_aead *tfm) { - struct ccp_ctx *ctx = crypto_aead_ctx(tfm); + struct ccp_ctx *ctx = crypto_aead_ctx_dma(tfm); ctx->complete = ccp_aes_gcm_complete; ctx->u.aes.key_len = 0; - crypto_aead_set_reqsize(tfm, sizeof(struct ccp_aes_req_ctx)); + crypto_aead_set_reqsize_dma(tfm, sizeof(struct ccp_aes_req_ctx)); return 0; } @@ -176,7 +176,7 @@ static struct aead_alg ccp_aes_gcm_defaults = { CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ccp_ctx), + .cra_ctxsize = sizeof(struct ccp_ctx) + CRYPTO_DMA_PADDING, .cra_priority = CCP_CRA_PRIORITY, .cra_exit = ccp_aes_gcm_cra_exit, .cra_module = THIS_MODULE, diff --git a/drivers/crypto/ccp/ccp-crypto-aes-xts.c b/drivers/crypto/ccp/ccp-crypto-aes-xts.c index 6849261ca47d0..93f735d6b02b2 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-xts.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-xts.c @@ -62,7 +62,7 @@ static struct ccp_unit_size_map xts_unit_sizes[] = { static int ccp_aes_xts_complete(struct crypto_async_request *async_req, int ret) { struct skcipher_request *req = skcipher_request_cast(async_req); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); if (ret) return ret; @@ -75,7 +75,7 @@ static int ccp_aes_xts_complete(struct crypto_async_request *async_req, int ret) static int ccp_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); unsigned int ccpversion = ccp_version(); int ret; @@ -105,8 +105,8 @@ static int ccp_aes_xts_crypt(struct skcipher_request *req, unsigned int encrypt) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); unsigned int ccpversion = ccp_version(); unsigned int fallback = 0; unsigned int unit; @@ -196,7 +196,7 @@ static int ccp_aes_xts_decrypt(struct skcipher_request *req) static int ccp_aes_xts_init_tfm(struct crypto_skcipher *tfm) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); struct crypto_skcipher *fallback_tfm; ctx->complete = ccp_aes_xts_complete; @@ -210,15 +210,16 @@ static int ccp_aes_xts_init_tfm(struct crypto_skcipher *tfm) } ctx->u.aes.tfm_skcipher = fallback_tfm; - crypto_skcipher_set_reqsize(tfm, sizeof(struct ccp_aes_req_ctx) + - crypto_skcipher_reqsize(fallback_tfm)); + crypto_skcipher_set_reqsize_dma(tfm, + sizeof(struct ccp_aes_req_ctx) + + crypto_skcipher_reqsize(fallback_tfm)); return 0; } static void ccp_aes_xts_exit_tfm(struct crypto_skcipher *tfm) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); crypto_free_skcipher(ctx->u.aes.tfm_skcipher); } @@ -246,7 +247,8 @@ static int ccp_register_aes_xts_alg(struct list_head *head, CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK; alg->base.cra_blocksize = AES_BLOCK_SIZE; - alg->base.cra_ctxsize = sizeof(struct ccp_ctx); + alg->base.cra_ctxsize = sizeof(struct ccp_ctx) + + crypto_dma_padding(); alg->base.cra_priority = CCP_CRA_PRIORITY; alg->base.cra_module = THIS_MODULE; diff --git a/drivers/crypto/ccp/ccp-crypto-aes.c b/drivers/crypto/ccp/ccp-crypto-aes.c index bed331953ff94..918e223f21b65 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes.c +++ b/drivers/crypto/ccp/ccp-crypto-aes.c @@ -22,8 +22,9 @@ static int ccp_aes_complete(struct crypto_async_request *async_req, int ret) { struct skcipher_request *req = skcipher_request_cast(async_req); - struct ccp_ctx *ctx = crypto_tfm_ctx(req->base.tfm); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma( + crypto_skcipher_reqtfm(req)); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); if (ret) return ret; @@ -38,7 +39,7 @@ static int ccp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct ccp_crypto_skcipher_alg *alg = ccp_crypto_skcipher_alg(tfm); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); switch (key_len) { case AES_KEYSIZE_128: @@ -65,8 +66,8 @@ static int ccp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ccp_aes_crypt(struct skcipher_request *req, bool encrypt) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); struct scatterlist *iv_sg = NULL; unsigned int iv_len = 0; @@ -118,7 +119,7 @@ static int ccp_aes_decrypt(struct skcipher_request *req) static int ccp_aes_init_tfm(struct crypto_skcipher *tfm) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); ctx->complete = ccp_aes_complete; ctx->u.aes.key_len = 0; @@ -132,7 +133,7 @@ static int ccp_aes_rfc3686_complete(struct crypto_async_request *async_req, int ret) { struct skcipher_request *req = skcipher_request_cast(async_req); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); /* Restore the original pointer */ req->iv = rctx->rfc3686_info; @@ -143,7 +144,7 @@ static int ccp_aes_rfc3686_complete(struct crypto_async_request *async_req, static int ccp_aes_rfc3686_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); if (key_len < CTR_RFC3686_NONCE_SIZE) return -EINVAL; @@ -157,8 +158,8 @@ static int ccp_aes_rfc3686_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ccp_aes_rfc3686_crypt(struct skcipher_request *req, bool encrypt) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); - struct ccp_aes_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); + struct ccp_aes_req_ctx *rctx = skcipher_request_ctx_dma(req); u8 *iv; /* Initialize the CTR block */ @@ -190,12 +191,12 @@ static int ccp_aes_rfc3686_decrypt(struct skcipher_request *req) static int ccp_aes_rfc3686_init_tfm(struct crypto_skcipher *tfm) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); ctx->complete = ccp_aes_rfc3686_complete; ctx->u.aes.key_len = 0; - crypto_skcipher_set_reqsize(tfm, sizeof(struct ccp_aes_req_ctx)); + crypto_skcipher_set_reqsize_dma(tfm, sizeof(struct ccp_aes_req_ctx)); return 0; } @@ -213,7 +214,7 @@ static const struct skcipher_alg ccp_aes_defaults = { CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ccp_ctx), + .base.cra_ctxsize = sizeof(struct ccp_ctx) + CRYPTO_DMA_PADDING, .base.cra_priority = CCP_CRA_PRIORITY, .base.cra_module = THIS_MODULE, }; @@ -231,7 +232,7 @@ static const struct skcipher_alg ccp_aes_rfc3686_defaults = { CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = CTR_RFC3686_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ccp_ctx), + .base.cra_ctxsize = sizeof(struct ccp_ctx) + CRYPTO_DMA_PADDING, .base.cra_priority = CCP_CRA_PRIORITY, .base.cra_module = THIS_MODULE, }; diff --git a/drivers/crypto/ccp/ccp-crypto-des3.c b/drivers/crypto/ccp/ccp-crypto-des3.c index 278636ed251a5..afae30adb703f 100644 --- a/drivers/crypto/ccp/ccp-crypto-des3.c +++ b/drivers/crypto/ccp/ccp-crypto-des3.c @@ -21,8 +21,9 @@ static int ccp_des3_complete(struct crypto_async_request *async_req, int ret) { struct skcipher_request *req = skcipher_request_cast(async_req); - struct ccp_ctx *ctx = crypto_tfm_ctx(req->base.tfm); - struct ccp_des3_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma( + crypto_skcipher_reqtfm(req)); + struct ccp_des3_req_ctx *rctx = skcipher_request_ctx_dma(req); if (ret) return ret; @@ -37,7 +38,7 @@ static int ccp_des3_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct ccp_crypto_skcipher_alg *alg = ccp_crypto_skcipher_alg(tfm); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); int err; err = verify_skcipher_des3_key(tfm, key); @@ -60,8 +61,8 @@ static int ccp_des3_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ccp_des3_crypt(struct skcipher_request *req, bool encrypt) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); - struct ccp_des3_req_ctx *rctx = skcipher_request_ctx(req); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); + struct ccp_des3_req_ctx *rctx = skcipher_request_ctx_dma(req); struct scatterlist *iv_sg = NULL; unsigned int iv_len = 0; @@ -114,12 +115,12 @@ static int ccp_des3_decrypt(struct skcipher_request *req) static int ccp_des3_init_tfm(struct crypto_skcipher *tfm) { - struct ccp_ctx *ctx = crypto_skcipher_ctx(tfm); + struct ccp_ctx *ctx = crypto_skcipher_ctx_dma(tfm); ctx->complete = ccp_des3_complete; ctx->u.des3.key_len = 0; - crypto_skcipher_set_reqsize(tfm, sizeof(struct ccp_des3_req_ctx)); + crypto_skcipher_set_reqsize_dma(tfm, sizeof(struct ccp_des3_req_ctx)); return 0; } @@ -137,7 +138,7 @@ static const struct skcipher_alg ccp_des3_defaults = { CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK, .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ccp_ctx), + .base.cra_ctxsize = sizeof(struct ccp_ctx) + CRYPTO_DMA_PADDING, .base.cra_priority = CCP_CRA_PRIORITY, .base.cra_module = THIS_MODULE, }; diff --git a/drivers/crypto/ccp/ccp-crypto-main.c b/drivers/crypto/ccp/ccp-crypto-main.c index dd86d2650bea4..73442a382f683 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -139,7 +139,7 @@ static void ccp_crypto_complete(void *data, int err) struct ccp_crypto_cmd *crypto_cmd = data; struct ccp_crypto_cmd *held, *next, *backlog; struct crypto_async_request *req = crypto_cmd->req; - struct ccp_ctx *ctx = crypto_tfm_ctx(req->tfm); + struct ccp_ctx *ctx = crypto_tfm_ctx_dma(req->tfm); int ret; if (err == -EINPROGRESS) { @@ -183,7 +183,7 @@ static void ccp_crypto_complete(void *data, int err) break; /* Error occurred, report it and get the next entry */ - ctx = crypto_tfm_ctx(held->req->tfm); + ctx = crypto_tfm_ctx_dma(held->req->tfm); if (ctx->complete) ret = ctx->complete(held->req, ret); held->req->complete(held->req, ret); diff --git a/drivers/crypto/ccp/ccp-crypto-rsa.c b/drivers/crypto/ccp/ccp-crypto-rsa.c index 1223ac70aea28..a14f85512cf4f 100644 --- a/drivers/crypto/ccp/ccp-crypto-rsa.c +++ b/drivers/crypto/ccp/ccp-crypto-rsa.c @@ -44,7 +44,7 @@ static inline int ccp_copy_and_save_keypart(u8 **kpbuf, unsigned int *kplen, static int ccp_rsa_complete(struct crypto_async_request *async_req, int ret) { struct akcipher_request *req = akcipher_request_cast(async_req); - struct ccp_rsa_req_ctx *rctx = akcipher_request_ctx(req); + struct ccp_rsa_req_ctx *rctx = akcipher_request_ctx_dma(req); if (ret) return ret; @@ -56,7 +56,7 @@ static int ccp_rsa_complete(struct crypto_async_request *async_req, int ret) static unsigned int ccp_rsa_maxsize(struct crypto_akcipher *tfm) { - struct ccp_ctx *ctx = akcipher_tfm_ctx(tfm); + struct ccp_ctx *ctx = akcipher_tfm_ctx_dma(tfm); return ctx->u.rsa.n_len; } @@ -64,8 +64,8 @@ static unsigned int ccp_rsa_maxsize(struct crypto_akcipher *tfm) static int ccp_rsa_crypt(struct akcipher_request *req, bool encrypt) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct ccp_ctx *ctx = akcipher_tfm_ctx(tfm); - struct ccp_rsa_req_ctx *rctx = akcipher_request_ctx(req); + struct ccp_ctx *ctx = akcipher_tfm_ctx_dma(tfm); + struct ccp_rsa_req_ctx *rctx = akcipher_request_ctx_dma(req); int ret = 0; memset(&rctx->cmd, 0, sizeof(rctx->cmd)); @@ -126,7 +126,7 @@ static void ccp_rsa_free_key_bufs(struct ccp_ctx *ctx) static int ccp_rsa_setkey(struct crypto_akcipher *tfm, const void *key, unsigned int keylen, bool private) { - struct ccp_ctx *ctx = akcipher_tfm_ctx(tfm); + struct ccp_ctx *ctx = akcipher_tfm_ctx_dma(tfm); struct rsa_key raw_key; int ret; @@ -192,9 +192,9 @@ static int ccp_rsa_setpubkey(struct crypto_akcipher *tfm, const void *key, static int ccp_rsa_init_tfm(struct crypto_akcipher *tfm) { - struct ccp_ctx *ctx = akcipher_tfm_ctx(tfm); + struct ccp_ctx *ctx = akcipher_tfm_ctx_dma(tfm); - akcipher_set_reqsize(tfm, sizeof(struct ccp_rsa_req_ctx)); + akcipher_set_reqsize_dma(tfm, sizeof(struct ccp_rsa_req_ctx)); ctx->complete = ccp_rsa_complete; return 0; @@ -202,7 +202,7 @@ static int ccp_rsa_init_tfm(struct crypto_akcipher *tfm) static void ccp_rsa_exit_tfm(struct crypto_akcipher *tfm) { - struct ccp_ctx *ctx = crypto_tfm_ctx(&tfm->base); + struct ccp_ctx *ctx = akcipher_tfm_ctx_dma(tfm); ccp_rsa_free_key_bufs(ctx); } @@ -220,7 +220,7 @@ static struct akcipher_alg ccp_rsa_defaults = { .cra_driver_name = "rsa-ccp", .cra_priority = CCP_CRA_PRIORITY, .cra_module = THIS_MODULE, - .cra_ctxsize = 2 * sizeof(struct ccp_ctx), + .cra_ctxsize = 2 * sizeof(struct ccp_ctx) + CRYPTO_DMA_PADDING, }, }; diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 74fa5360e7226..fa3ae8e78f6f3 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -28,7 +28,7 @@ static int ccp_sha_complete(struct crypto_async_request *async_req, int ret) { struct ahash_request *req = ahash_request_cast(async_req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ccp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_sha_req_ctx *rctx = ahash_request_ctx_dma(req); unsigned int digest_size = crypto_ahash_digestsize(tfm); if (ret) @@ -59,8 +59,8 @@ static int ccp_do_sha_update(struct ahash_request *req, unsigned int nbytes, unsigned int final) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ccp_ctx *ctx = crypto_ahash_ctx(tfm); - struct ccp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(tfm); + struct ccp_sha_req_ctx *rctx = ahash_request_ctx_dma(req); struct scatterlist *sg; unsigned int block_size = crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); @@ -182,8 +182,8 @@ e_free: static int ccp_sha_init(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ccp_ctx *ctx = crypto_ahash_ctx(tfm); - struct ccp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(tfm); + struct ccp_sha_req_ctx *rctx = ahash_request_ctx_dma(req); struct ccp_crypto_ahash_alg *alg = ccp_crypto_ahash_alg(crypto_ahash_tfm(tfm)); unsigned int block_size = @@ -231,7 +231,7 @@ static int ccp_sha_digest(struct ahash_request *req) static int ccp_sha_export(struct ahash_request *req, void *out) { - struct ccp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_sha_req_ctx *rctx = ahash_request_ctx_dma(req); struct ccp_sha_exp_ctx state; /* Don't let anything leak to 'out' */ @@ -252,7 +252,7 @@ static int ccp_sha_export(struct ahash_request *req, void *out) static int ccp_sha_import(struct ahash_request *req, const void *in) { - struct ccp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct ccp_sha_req_ctx *rctx = ahash_request_ctx_dma(req); struct ccp_sha_exp_ctx state; /* 'in' may not be aligned so memcpy to local variable */ @@ -272,7 +272,7 @@ static int ccp_sha_import(struct ahash_request *req, const void *in) static int ccp_sha_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int key_len) { - struct ccp_ctx *ctx = crypto_tfm_ctx(crypto_ahash_tfm(tfm)); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct crypto_shash *shash = ctx->u.sha.hmac_tfm; unsigned int block_size = crypto_shash_blocksize(shash); unsigned int digest_size = crypto_shash_digestsize(shash); @@ -313,13 +313,13 @@ static int ccp_sha_setkey(struct crypto_ahash *tfm, const u8 *key, static int ccp_sha_cra_init(struct crypto_tfm *tfm) { - struct ccp_ctx *ctx = crypto_tfm_ctx(tfm); struct crypto_ahash *ahash = __crypto_ahash_cast(tfm); + struct ccp_ctx *ctx = crypto_ahash_ctx_dma(ahash); ctx->complete = ccp_sha_complete; ctx->u.sha.key_len = 0; - crypto_ahash_set_reqsize(ahash, sizeof(struct ccp_sha_req_ctx)); + crypto_ahash_set_reqsize_dma(ahash, sizeof(struct ccp_sha_req_ctx)); return 0; } @@ -330,7 +330,7 @@ static void ccp_sha_cra_exit(struct crypto_tfm *tfm) static int ccp_hmac_sha_cra_init(struct crypto_tfm *tfm) { - struct ccp_ctx *ctx = crypto_tfm_ctx(tfm); + struct ccp_ctx *ctx = crypto_tfm_ctx_dma(tfm); struct ccp_crypto_ahash_alg *alg = ccp_crypto_ahash_alg(tfm); struct crypto_shash *hmac_tfm; @@ -348,7 +348,7 @@ static int ccp_hmac_sha_cra_init(struct crypto_tfm *tfm) static void ccp_hmac_sha_cra_exit(struct crypto_tfm *tfm) { - struct ccp_ctx *ctx = crypto_tfm_ctx(tfm); + struct ccp_ctx *ctx = crypto_tfm_ctx_dma(tfm); if (ctx->u.sha.hmac_tfm) crypto_free_shash(ctx->u.sha.hmac_tfm); @@ -492,7 +492,7 @@ static int ccp_register_sha_alg(struct list_head *head, CRYPTO_ALG_KERN_DRIVER_ONLY | CRYPTO_ALG_NEED_FALLBACK; base->cra_blocksize = def->block_size; - base->cra_ctxsize = sizeof(struct ccp_ctx); + base->cra_ctxsize = sizeof(struct ccp_ctx) + crypto_dma_padding(); base->cra_priority = CCP_CRA_PRIORITY; base->cra_init = ccp_sha_cra_init; base->cra_exit = ccp_sha_cra_exit; -- GitLab From 07547fa73e4645363165e662f50427a7d302dcf1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:51 +0800 Subject: [PATCH 1268/1423] crypto: ccree - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_aead.c | 62 ++++++++++---------- drivers/crypto/ccree/cc_buffer_mgr.c | 18 +++--- drivers/crypto/ccree/cc_hash.c | 86 ++++++++++++++-------------- 3 files changed, 83 insertions(+), 83 deletions(-) diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c index 35794c7271fb6..109ffb375fc69 100644 --- a/drivers/crypto/ccree/cc_aead.c +++ b/drivers/crypto/ccree/cc_aead.c @@ -138,7 +138,7 @@ static int cc_aead_init(struct crypto_aead *tfm) ctx->flow_mode = cc_alg->flow_mode; ctx->auth_mode = cc_alg->auth_mode; ctx->drvdata = cc_alg->drvdata; - crypto_aead_set_reqsize(tfm, sizeof(struct aead_req_ctx)); + crypto_aead_set_reqsize_dma(tfm, sizeof(struct aead_req_ctx)); /* Allocate key buffer, cache line aligned */ ctx->enckey = dma_alloc_coherent(dev, AES_MAX_KEY_SIZE, @@ -208,7 +208,7 @@ init_failed: static void cc_aead_complete(struct device *dev, void *cc_req, int err) { struct aead_request *areq = (struct aead_request *)cc_req; - struct aead_req_ctx *areq_ctx = aead_request_ctx(areq); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(areq); struct crypto_aead *tfm = crypto_aead_reqtfm(cc_req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); @@ -723,7 +723,7 @@ static void cc_set_assoc_desc(struct aead_request *areq, unsigned int flow_mode, { struct crypto_aead *tfm = crypto_aead_reqtfm(areq); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *areq_ctx = aead_request_ctx(areq); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(areq); enum cc_req_dma_buf_type assoc_dma_type = areq_ctx->assoc_buff_type; unsigned int idx = *seq_size; struct device *dev = drvdata_to_dev(ctx->drvdata); @@ -762,7 +762,7 @@ static void cc_proc_authen_desc(struct aead_request *areq, struct cc_hw_desc desc[], unsigned int *seq_size, int direct) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(areq); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(areq); enum cc_req_dma_buf_type data_dma_type = areq_ctx->data_buff_type; unsigned int idx = *seq_size; struct crypto_aead *tfm = crypto_aead_reqtfm(areq); @@ -827,7 +827,7 @@ static void cc_proc_cipher_desc(struct aead_request *areq, unsigned int *seq_size) { unsigned int idx = *seq_size; - struct aead_req_ctx *areq_ctx = aead_request_ctx(areq); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(areq); enum cc_req_dma_buf_type data_dma_type = areq_ctx->data_buff_type; struct crypto_aead *tfm = crypto_aead_reqtfm(areq); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); @@ -873,7 +873,7 @@ static void cc_proc_digest_desc(struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int idx = *seq_size; unsigned int hash_mode = (ctx->auth_mode == DRV_HASH_SHA1) ? DRV_HASH_HW_SHA1 : DRV_HASH_HW_SHA256; @@ -923,7 +923,7 @@ static void cc_set_cipher_desc(struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int hw_iv_size = req_ctx->hw_iv_size; unsigned int idx = *seq_size; int direct = req_ctx->gen_ctx.op_type; @@ -965,7 +965,7 @@ static void cc_set_cipher_desc(struct aead_request *req, static void cc_proc_cipher(struct aead_request *req, struct cc_hw_desc desc[], unsigned int *seq_size, unsigned int data_flow_mode) { - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); int direct = req_ctx->gen_ctx.op_type; unsigned int idx = *seq_size; @@ -1082,7 +1082,7 @@ static void cc_proc_header_desc(struct aead_request *req, struct cc_hw_desc desc[], unsigned int *seq_size) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); unsigned int idx = *seq_size; /* Hash associated data */ @@ -1158,7 +1158,7 @@ static void cc_proc_scheme_desc(struct aead_request *req, static void cc_mlli_to_sram(struct aead_request *req, struct cc_hw_desc desc[], unsigned int *seq_size) { - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); @@ -1212,7 +1212,7 @@ static void cc_hmac_authenc(struct aead_request *req, struct cc_hw_desc desc[], { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); int direct = req_ctx->gen_ctx.op_type; unsigned int data_flow_mode = cc_get_data_flow(direct, ctx->flow_mode, @@ -1265,7 +1265,7 @@ cc_xcbc_authenc(struct aead_request *req, struct cc_hw_desc desc[], { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); int direct = req_ctx->gen_ctx.op_type; unsigned int data_flow_mode = cc_get_data_flow(direct, ctx->flow_mode, @@ -1312,7 +1312,7 @@ static int validate_data_size(struct cc_aead_ctx *ctx, enum drv_crypto_direction direct, struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); struct device *dev = drvdata_to_dev(ctx->drvdata); unsigned int assoclen = areq_ctx->assoclen; unsigned int cipherlen = (direct == DRV_CRYPTO_DIRECTION_DECRYPT) ? @@ -1411,7 +1411,7 @@ static int cc_ccm(struct aead_request *req, struct cc_hw_desc desc[], { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int idx = *seq_size; unsigned int cipher_flow_mode; dma_addr_t mac_result; @@ -1533,7 +1533,7 @@ static int config_ccm_adata(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); //unsigned int size_of_a = 0, rem_a_size = 0; unsigned int lp = req->iv[0]; /* Note: The code assume that req->iv[0] already contains the value @@ -1591,7 +1591,7 @@ static void cc_proc_rfc4309_ccm(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); /* L' */ memset(areq_ctx->ctr_iv, 0, AES_BLOCK_SIZE); @@ -1615,7 +1615,7 @@ static void cc_set_ghash_desc(struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int idx = *seq_size; /* load key to AES*/ @@ -1693,7 +1693,7 @@ static void cc_set_gctr_desc(struct aead_request *req, struct cc_hw_desc desc[], { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int idx = *seq_size; /* load key to AES*/ @@ -1730,7 +1730,7 @@ static void cc_proc_gcm_result(struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); dma_addr_t mac_result; unsigned int idx = *seq_size; @@ -1792,7 +1792,7 @@ static void cc_proc_gcm_result(struct aead_request *req, static int cc_gcm(struct aead_request *req, struct cc_hw_desc desc[], unsigned int *seq_size) { - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); unsigned int cipher_flow_mode; //in RFC4543 no data to encrypt. just copy data from src to dest. @@ -1830,7 +1830,7 @@ static int config_gcm_context(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *req_ctx = aead_request_ctx(req); + struct aead_req_ctx *req_ctx = aead_request_ctx_dma(req); struct device *dev = drvdata_to_dev(ctx->drvdata); unsigned int cryptlen = (req_ctx->gen_ctx.op_type == @@ -1879,7 +1879,7 @@ static void cc_proc_rfc4_gcm(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); memcpy(areq_ctx->ctr_iv + GCM_BLOCK_RFC4_NONCE_OFFSET, ctx->ctr_nonce, GCM_BLOCK_RFC4_NONCE_SIZE); @@ -1896,7 +1896,7 @@ static int cc_proc_aead(struct aead_request *req, struct cc_hw_desc desc[MAX_AEAD_PROCESS_SEQ]; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); struct device *dev = drvdata_to_dev(ctx->drvdata); struct cc_crypto_req cc_req = {}; @@ -2019,7 +2019,7 @@ exit: static int cc_aead_encrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; memset(areq_ctx, 0, sizeof(*areq_ctx)); @@ -2039,7 +2039,7 @@ static int cc_rfc4309_ccm_encrypt(struct aead_request *req) { /* Very similar to cc_aead_encrypt() above. */ - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); @@ -2063,7 +2063,7 @@ out: static int cc_aead_decrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; memset(areq_ctx, 0, sizeof(*areq_ctx)); @@ -2081,7 +2081,7 @@ static int cc_aead_decrypt(struct aead_request *req) static int cc_rfc4309_ccm_decrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); @@ -2193,7 +2193,7 @@ static int cc_rfc4543_gcm_setauthsize(struct crypto_aead *authenc, static int cc_rfc4106_gcm_encrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); @@ -2217,7 +2217,7 @@ out: static int cc_rfc4543_gcm_encrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); @@ -2244,7 +2244,7 @@ out: static int cc_rfc4106_gcm_decrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); @@ -2268,7 +2268,7 @@ out: static int cc_rfc4543_gcm_decrypt(struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc; rc = crypto_ipsec_check_assoclen(req->assoclen); diff --git a/drivers/crypto/ccree/cc_buffer_mgr.c b/drivers/crypto/ccree/cc_buffer_mgr.c index 9efd88f871d12..bcca55bff910e 100644 --- a/drivers/crypto/ccree/cc_buffer_mgr.c +++ b/drivers/crypto/ccree/cc_buffer_mgr.c @@ -52,7 +52,7 @@ static inline char *cc_dma_buf_type(enum cc_req_dma_buf_type type) static void cc_copy_mac(struct device *dev, struct aead_request *req, enum cc_sg_cpy_direct dir) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); u32 skip = req->assoclen + req->cryptlen; cc_copy_sg_portion(dev, areq_ctx->backup_mac, req->src, @@ -456,7 +456,7 @@ cipher_exit: void cc_unmap_aead_request(struct device *dev, struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); unsigned int hw_iv_size = areq_ctx->hw_iv_size; struct cc_drvdata *drvdata = dev_get_drvdata(dev); int src_direction = (req->src != req->dst ? DMA_TO_DEVICE : DMA_BIDIRECTIONAL); @@ -546,7 +546,7 @@ static int cc_aead_chain_iv(struct cc_drvdata *drvdata, struct buffer_array *sg_data, bool is_last, bool do_chain) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); unsigned int hw_iv_size = areq_ctx->hw_iv_size; struct device *dev = drvdata_to_dev(drvdata); gfp_t flags = cc_gfp_flags(&req->base); @@ -586,7 +586,7 @@ static int cc_aead_chain_assoc(struct cc_drvdata *drvdata, struct buffer_array *sg_data, bool is_last, bool do_chain) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); int rc = 0; int mapped_nents = 0; struct device *dev = drvdata_to_dev(drvdata); @@ -652,7 +652,7 @@ chain_assoc_exit: static void cc_prepare_aead_data_dlli(struct aead_request *req, u32 *src_last_bytes, u32 *dst_last_bytes) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); enum drv_crypto_direction direct = areq_ctx->gen_ctx.op_type; unsigned int authsize = areq_ctx->req_authsize; struct scatterlist *sg; @@ -678,7 +678,7 @@ static void cc_prepare_aead_data_mlli(struct cc_drvdata *drvdata, u32 *src_last_bytes, u32 *dst_last_bytes, bool is_last_table) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); enum drv_crypto_direction direct = areq_ctx->gen_ctx.op_type; unsigned int authsize = areq_ctx->req_authsize; struct device *dev = drvdata_to_dev(drvdata); @@ -790,7 +790,7 @@ static int cc_aead_chain_data(struct cc_drvdata *drvdata, struct buffer_array *sg_data, bool is_last_table, bool do_chain) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); struct device *dev = drvdata_to_dev(drvdata); enum drv_crypto_direction direct = areq_ctx->gen_ctx.op_type; unsigned int authsize = areq_ctx->req_authsize; @@ -895,7 +895,7 @@ chain_data_exit: static void cc_update_aead_mlli_nents(struct cc_drvdata *drvdata, struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); u32 curr_mlli_size = 0; if (areq_ctx->assoc_buff_type == CC_DMA_BUF_MLLI) { @@ -945,7 +945,7 @@ static void cc_update_aead_mlli_nents(struct cc_drvdata *drvdata, int cc_map_aead_request(struct cc_drvdata *drvdata, struct aead_request *req) { - struct aead_req_ctx *areq_ctx = aead_request_ctx(req); + struct aead_req_ctx *areq_ctx = aead_request_ctx_dma(req); struct mlli_params *mlli_params = &areq_ctx->mlli_params; struct device *dev = drvdata_to_dev(drvdata); struct buffer_array sg_data; diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index 683c9a430e11b..f418162932fe3 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -283,9 +283,9 @@ static void cc_unmap_result(struct device *dev, struct ahash_req_ctx *state, static void cc_update_complete(struct device *dev, void *cc_req, int err) { struct ahash_request *req = (struct ahash_request *)cc_req; - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); dev_dbg(dev, "req=%pK\n", req); @@ -301,9 +301,9 @@ static void cc_update_complete(struct device *dev, void *cc_req, int err) static void cc_digest_complete(struct device *dev, void *cc_req, int err) { struct ahash_request *req = (struct ahash_request *)cc_req; - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); dev_dbg(dev, "req=%pK\n", req); @@ -321,9 +321,9 @@ static void cc_digest_complete(struct device *dev, void *cc_req, int err) static void cc_hash_complete(struct device *dev, void *cc_req, int err) { struct ahash_request *req = (struct ahash_request *)cc_req; - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); dev_dbg(dev, "req=%pK\n", req); @@ -341,9 +341,9 @@ static void cc_hash_complete(struct device *dev, void *cc_req, int err) static int cc_fin_result(struct cc_hw_desc *desc, struct ahash_request *req, int idx) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); /* Get final MAC result */ @@ -364,9 +364,9 @@ static int cc_fin_result(struct cc_hw_desc *desc, struct ahash_request *req, static int cc_fin_hmac(struct cc_hw_desc *desc, struct ahash_request *req, int idx) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); /* store the hash digest result in the context */ @@ -417,9 +417,9 @@ static int cc_fin_hmac(struct cc_hw_desc *desc, struct ahash_request *req, static int cc_hash_digest(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); struct scatterlist *src = req->src; unsigned int nbytes = req->nbytes; @@ -555,9 +555,9 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx, static int cc_hash_update(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); unsigned int block_size = crypto_tfm_alg_blocksize(&tfm->base); struct scatterlist *src = req->src; unsigned int nbytes = req->nbytes; @@ -631,9 +631,9 @@ static int cc_hash_update(struct ahash_request *req) static int cc_do_finup(struct ahash_request *req, bool update) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); u32 digestsize = crypto_ahash_digestsize(tfm); struct scatterlist *src = req->src; unsigned int nbytes = req->nbytes; @@ -711,9 +711,9 @@ static int cc_hash_final(struct ahash_request *req) static int cc_hash_init(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); dev_dbg(dev, "===== init (%d) ====\n", req->nbytes); @@ -736,7 +736,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key, u32 larval_addr; struct device *dev; - ctx = crypto_ahash_ctx(ahash); + ctx = crypto_ahash_ctx_dma(ahash); dev = drvdata_to_dev(ctx->drvdata); dev_dbg(dev, "start keylen: %d", keylen); @@ -922,7 +922,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { struct cc_crypto_req cc_req = {}; - struct cc_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); struct device *dev = drvdata_to_dev(ctx->drvdata); int rc = 0; unsigned int idx = 0; @@ -1007,7 +1007,7 @@ static int cc_xcbc_setkey(struct crypto_ahash *ahash, static int cc_cmac_setkey(struct crypto_ahash *ahash, const u8 *key, unsigned int keylen) { - struct cc_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); struct device *dev = drvdata_to_dev(ctx->drvdata); dev_dbg(dev, "===== setkey (%d) ====\n", keylen); @@ -1109,7 +1109,7 @@ fail: static int cc_get_hash_len(struct crypto_tfm *tfm) { - struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); if (ctx->hash_mode == DRV_HASH_SM3) return CC_SM3_HASH_LEN_SIZE; @@ -1119,7 +1119,7 @@ static int cc_get_hash_len(struct crypto_tfm *tfm) static int cc_cra_init(struct crypto_tfm *tfm) { - struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); struct hash_alg_common *hash_alg_common = container_of(tfm->__crt_alg, struct hash_alg_common, base); struct ahash_alg *ahash_alg = @@ -1127,8 +1127,8 @@ static int cc_cra_init(struct crypto_tfm *tfm) struct cc_hash_alg *cc_alg = container_of(ahash_alg, struct cc_hash_alg, ahash_alg); - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_req_ctx)); + crypto_ahash_set_reqsize_dma(__crypto_ahash_cast(tfm), + sizeof(struct ahash_req_ctx)); ctx->hash_mode = cc_alg->hash_mode; ctx->hw_mode = cc_alg->hw_mode; @@ -1140,7 +1140,7 @@ static int cc_cra_init(struct crypto_tfm *tfm) static void cc_cra_exit(struct crypto_tfm *tfm) { - struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_tfm_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); dev_dbg(dev, "cc_cra_exit"); @@ -1149,9 +1149,9 @@ static void cc_cra_exit(struct crypto_tfm *tfm) static int cc_mac_update(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); unsigned int block_size = crypto_tfm_alg_blocksize(&tfm->base); struct cc_crypto_req cc_req = {}; @@ -1217,9 +1217,9 @@ static int cc_mac_update(struct ahash_request *req) static int cc_mac_final(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); struct cc_crypto_req cc_req = {}; struct cc_hw_desc desc[CC_MAX_HASH_SEQ_LEN]; @@ -1338,9 +1338,9 @@ static int cc_mac_final(struct ahash_request *req) static int cc_mac_finup(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); struct cc_crypto_req cc_req = {}; struct cc_hw_desc desc[CC_MAX_HASH_SEQ_LEN]; @@ -1419,9 +1419,9 @@ static int cc_mac_finup(struct ahash_request *req) static int cc_mac_digest(struct ahash_request *req) { - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); struct device *dev = drvdata_to_dev(ctx->drvdata); u32 digestsize = crypto_ahash_digestsize(tfm); struct cc_crypto_req cc_req = {}; @@ -1499,8 +1499,8 @@ static int cc_mac_digest(struct ahash_request *req) static int cc_hash_export(struct ahash_request *req, void *out) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(ahash); - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); u8 *curr_buff = cc_hash_buf(state); u32 curr_buff_cnt = *cc_hash_buf_cnt(state); const u32 tmp = CC_EXPORT_MAGIC; @@ -1525,9 +1525,9 @@ static int cc_hash_export(struct ahash_request *req, void *out) static int cc_hash_import(struct ahash_request *req, const void *in) { struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(ahash); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(ahash); struct device *dev = drvdata_to_dev(ctx->drvdata); - struct ahash_req_ctx *state = ahash_request_ctx(req); + struct ahash_req_ctx *state = ahash_request_ctx_dma(req); u32 tmp; memcpy(&tmp, in, sizeof(u32)); @@ -1846,7 +1846,7 @@ static struct cc_hash_alg *cc_alloc_hash_alg(struct cc_hash_template *template, template->driver_name); } alg->cra_module = THIS_MODULE; - alg->cra_ctxsize = sizeof(struct cc_hash_ctx); + alg->cra_ctxsize = sizeof(struct cc_hash_ctx) + crypto_dma_padding(); alg->cra_priority = CC_CRA_PRIO; alg->cra_blocksize = template->blocksize; alg->cra_alignmask = 0; @@ -2073,9 +2073,9 @@ static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[], unsigned int *seq_size) { unsigned int idx = *seq_size; - struct ahash_req_ctx *state = ahash_request_ctx(areq); + struct ahash_req_ctx *state = ahash_request_ctx_dma(areq); struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); /* Setup XCBC MAC K1 */ hw_desc_init(&desc[idx]); @@ -2130,9 +2130,9 @@ static void cc_setup_cmac(struct ahash_request *areq, struct cc_hw_desc desc[], unsigned int *seq_size) { unsigned int idx = *seq_size; - struct ahash_req_ctx *state = ahash_request_ctx(areq); + struct ahash_req_ctx *state = ahash_request_ctx_dma(areq); struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); - struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm); + struct cc_hash_ctx *ctx = crypto_ahash_ctx_dma(tfm); /* Setup CMAC Key */ hw_desc_init(&desc[idx]); -- GitLab From e055bffaa390042d73fed56a0ef9bfe71a675614 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:53 +0800 Subject: [PATCH 1269/1423] crypto: chelsio - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 43 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 9fac1e7584066..68d65773ef2bd 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -210,7 +210,7 @@ static inline int chcr_handle_aead_resp(struct aead_request *req, unsigned char *input, int err) { - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_dev *dev = a_ctx(tfm)->dev; @@ -718,7 +718,7 @@ static inline int get_qidxs(struct crypto_async_request *req, { struct aead_request *aead_req = container_of(req, struct aead_request, base); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(aead_req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(aead_req); *txqidx = reqctx->txqidx; *rxqidx = reqctx->rxqidx; break; @@ -2362,7 +2362,7 @@ static void chcr_hmac_cra_exit(struct crypto_tfm *tfm) inline void chcr_aead_common_exit(struct aead_request *req) { - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct uld_ctx *u_ctx = ULD_CTX(a_ctx(tfm)); @@ -2373,7 +2373,7 @@ static int chcr_aead_common_init(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); unsigned int authsize = crypto_aead_authsize(tfm); int error = -EINVAL; @@ -2417,7 +2417,7 @@ static int chcr_aead_fallback(struct aead_request *req, unsigned short op_type) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); - struct aead_request *subreq = aead_request_ctx(req); + struct aead_request *subreq = aead_request_ctx_dma(req); aead_request_set_tfm(subreq, aeadctx->sw_cipher); aead_request_set_callback(subreq, req->base.flags, @@ -2438,7 +2438,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, struct uld_ctx *u_ctx = ULD_CTX(ctx); struct chcr_aead_ctx *aeadctx = AEAD_CTX(ctx); struct chcr_authenc_ctx *actx = AUTHENC_CTX(aeadctx); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct sk_buff *skb = NULL; struct chcr_wr *chcr_req; struct cpl_rx_phys_dsgl *phys_cpl; @@ -2576,7 +2576,7 @@ int chcr_aead_dma_map(struct device *dev, unsigned short op_type) { int error; - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(tfm); int src_len, dst_len; @@ -2637,7 +2637,7 @@ void chcr_aead_dma_unmap(struct device *dev, struct aead_request *req, unsigned short op_type) { - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(tfm); int src_len, dst_len; @@ -2678,7 +2678,7 @@ void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx) { struct ulptx_walk ulp_walk; - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); if (reqctx->imm) { u8 *buf = (u8 *)ulptx; @@ -2704,7 +2704,7 @@ void chcr_add_aead_dst_ent(struct aead_request *req, struct cpl_rx_phys_dsgl *phys_cpl, unsigned short qid) { - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct dsgl_walk dsgl_walk; unsigned int authsize = crypto_aead_authsize(tfm); @@ -2894,7 +2894,7 @@ static int generate_b0(struct aead_request *req, u8 *ivptr, unsigned int l, lp, m; int rc; struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); u8 *b0 = reqctx->scratch_pad; m = crypto_aead_authsize(aead); @@ -2932,7 +2932,7 @@ static int ccm_format_packet(struct aead_request *req, unsigned short op_type, unsigned int assoclen) { - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); int rc = 0; @@ -2963,7 +2963,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl, struct chcr_context *ctx = a_ctx(tfm); struct uld_ctx *u_ctx = ULD_CTX(ctx); struct chcr_aead_ctx *aeadctx = AEAD_CTX(ctx); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); unsigned int cipher_mode = CHCR_SCMD_CIPHER_MODE_AES_CCM; unsigned int mac_mode = CHCR_SCMD_AUTH_MODE_CBCMAC; unsigned int rx_channel_id = reqctx->rxqidx / ctx->rxq_perchan; @@ -3036,7 +3036,7 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct sk_buff *skb = NULL; struct chcr_wr *chcr_req; struct cpl_rx_phys_dsgl *phys_cpl; @@ -3135,7 +3135,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, struct chcr_context *ctx = a_ctx(tfm); struct uld_ctx *u_ctx = ULD_CTX(ctx); struct chcr_aead_ctx *aeadctx = AEAD_CTX(ctx); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct sk_buff *skb = NULL; struct chcr_wr *chcr_req; struct cpl_rx_phys_dsgl *phys_cpl; @@ -3255,9 +3255,10 @@ static int chcr_aead_cra_init(struct crypto_aead *tfm) CRYPTO_ALG_ASYNC); if (IS_ERR(aeadctx->sw_cipher)) return PTR_ERR(aeadctx->sw_cipher); - crypto_aead_set_reqsize(tfm, max(sizeof(struct chcr_aead_reqctx), - sizeof(struct aead_request) + - crypto_aead_reqsize(aeadctx->sw_cipher))); + crypto_aead_set_reqsize_dma( + tfm, max(sizeof(struct chcr_aead_reqctx), + sizeof(struct aead_request) + + crypto_aead_reqsize(aeadctx->sw_cipher))); return chcr_device_init(a_ctx(tfm)); } @@ -3735,7 +3736,7 @@ static int chcr_aead_op(struct aead_request *req, create_wr_t create_wr_fn) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct chcr_context *ctx = a_ctx(tfm); struct uld_ctx *u_ctx = ULD_CTX(ctx); struct sk_buff *skb; @@ -3785,7 +3786,7 @@ static int chcr_aead_op(struct aead_request *req, static int chcr_aead_encrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); struct chcr_context *ctx = a_ctx(tfm); unsigned int cpu; @@ -3816,7 +3817,7 @@ static int chcr_aead_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct chcr_context *ctx = a_ctx(tfm); struct chcr_aead_ctx *aeadctx = AEAD_CTX(ctx); - struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx_dma(req); int size; unsigned int cpu; -- GitLab From 80b61baca4c8698139881f41473e652bedc65a73 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:55 +0800 Subject: [PATCH 1270/1423] crypto: hisilicon/hpre - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_crypto.c | 40 +++++++++++++-------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index 5f6d363c9435e..8ede77310dc52 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -147,6 +147,16 @@ struct hpre_asym_request { struct timespec64 req_time; }; +static inline unsigned int hpre_align_sz(void) +{ + return ((crypto_dma_align() - 1) | (HPRE_ALIGN_SZ - 1)) + 1; +} + +static inline unsigned int hpre_align_pd(void) +{ + return (hpre_align_sz() - 1) & ~(crypto_tfm_ctx_alignment() - 1); +} + static int hpre_alloc_req_id(struct hpre_ctx *ctx) { unsigned long flags; @@ -517,7 +527,7 @@ static int hpre_msg_request_set(struct hpre_ctx *ctx, void *req, bool is_rsa) } tmp = akcipher_request_ctx(akreq); - h_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + h_req = PTR_ALIGN(tmp, hpre_align_sz()); h_req->cb = hpre_rsa_cb; h_req->areq.rsa = akreq; msg = &h_req->req; @@ -531,7 +541,7 @@ static int hpre_msg_request_set(struct hpre_ctx *ctx, void *req, bool is_rsa) } tmp = kpp_request_ctx(kreq); - h_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + h_req = PTR_ALIGN(tmp, hpre_align_sz()); h_req->cb = hpre_dh_cb; h_req->areq.dh = kreq; msg = &h_req->req; @@ -582,7 +592,7 @@ static int hpre_dh_compute_value(struct kpp_request *req) struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); void *tmp = kpp_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); struct hpre_sqe *msg = &hpre_req->req; int ret; @@ -740,7 +750,7 @@ static int hpre_dh_init_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); return hpre_ctx_init(ctx, HPRE_V2_ALG_TYPE); } @@ -785,7 +795,7 @@ static int hpre_rsa_enc(struct akcipher_request *req) struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct hpre_ctx *ctx = akcipher_tfm_ctx(tfm); void *tmp = akcipher_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); struct hpre_sqe *msg = &hpre_req->req; int ret; @@ -833,7 +843,7 @@ static int hpre_rsa_dec(struct akcipher_request *req) struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct hpre_ctx *ctx = akcipher_tfm_ctx(tfm); void *tmp = akcipher_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); struct hpre_sqe *msg = &hpre_req->req; int ret; @@ -1168,7 +1178,7 @@ static int hpre_rsa_init_tfm(struct crypto_akcipher *tfm) } akcipher_set_reqsize(tfm, sizeof(struct hpre_asym_request) + - HPRE_ALIGN_SZ); + hpre_align_pd()); ret = hpre_ctx_init(ctx, HPRE_V2_ALG_TYPE); if (ret) @@ -1490,7 +1500,7 @@ static int hpre_ecdh_msg_request_set(struct hpre_ctx *ctx, } tmp = kpp_request_ctx(req); - h_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + h_req = PTR_ALIGN(tmp, hpre_align_sz()); h_req->cb = hpre_ecdh_cb; h_req->areq.ecdh = req; msg = &h_req->req; @@ -1571,7 +1581,7 @@ static int hpre_ecdh_compute_value(struct kpp_request *req) struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); struct device *dev = ctx->dev; void *tmp = kpp_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); struct hpre_sqe *msg = &hpre_req->req; int ret; @@ -1622,7 +1632,7 @@ static int hpre_ecdh_nist_p192_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P192; - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1633,7 +1643,7 @@ static int hpre_ecdh_nist_p256_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P256; - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1644,7 +1654,7 @@ static int hpre_ecdh_nist_p384_init_tfm(struct crypto_kpp *tfm) ctx->curve_id = ECC_CURVE_NIST_P384; - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } @@ -1802,7 +1812,7 @@ static int hpre_curve25519_msg_request_set(struct hpre_ctx *ctx, } tmp = kpp_request_ctx(req); - h_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + h_req = PTR_ALIGN(tmp, hpre_align_sz()); h_req->cb = hpre_curve25519_cb; h_req->areq.curve25519 = req; msg = &h_req->req; @@ -1923,7 +1933,7 @@ static int hpre_curve25519_compute_value(struct kpp_request *req) struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); struct device *dev = ctx->dev; void *tmp = kpp_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, HPRE_ALIGN_SZ); + struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); struct hpre_sqe *msg = &hpre_req->req; int ret; @@ -1972,7 +1982,7 @@ static int hpre_curve25519_init_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + HPRE_ALIGN_SZ); + kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); } -- GitLab From b2e2e2da7b4f62c54ce0d6a66c54e9fb05a8d514 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:57 +0800 Subject: [PATCH 1271/1423] crypto: safexcel - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_hash.c | 99 ++++++++++---------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 103fc551d2af9..ca46328472d4b 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -231,7 +231,7 @@ static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, struct safexcel_result_desc *rdesc; struct ahash_request *areq = ahash_request_cast(async); struct crypto_ahash *ahash = crypto_ahash_reqtfm(areq); - struct safexcel_ahash_req *sreq = ahash_request_ctx(areq); + struct safexcel_ahash_req *sreq = ahash_request_ctx_dma(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(ahash); u64 cache_len; @@ -312,7 +312,7 @@ static int safexcel_ahash_send_req(struct crypto_async_request *async, int ring, int *commands, int *results) { struct ahash_request *areq = ahash_request_cast(async); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); struct safexcel_crypto_priv *priv = ctx->base.priv; struct safexcel_command_desc *cdesc, *first_cdesc = NULL; @@ -569,7 +569,7 @@ static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring, bool *should_complete, int *ret) { struct ahash_request *areq = ahash_request_cast(async); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); int err; BUG_ON(!(priv->flags & EIP197_TRC_CACHE) && req->needs_inv); @@ -608,7 +608,7 @@ static int safexcel_ahash_send(struct crypto_async_request *async, int ring, int *commands, int *results) { struct ahash_request *areq = ahash_request_cast(async); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); int ret; if (req->needs_inv) @@ -624,7 +624,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->base.priv; EIP197_REQUEST_ON_STACK(req, ahash, EIP197_AHASH_REQ_SIZE); - struct safexcel_ahash_req *rctx = ahash_request_ctx(req); + struct safexcel_ahash_req *rctx = ahash_request_ctx_dma(req); struct safexcel_inv_result result = {}; int ring = ctx->base.ring; @@ -663,7 +663,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) */ static int safexcel_ahash_cache(struct ahash_request *areq) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); u64 cache_len; /* cache_len: everything accepted by the driver but not sent yet, @@ -689,7 +689,7 @@ static int safexcel_ahash_cache(struct ahash_request *areq) static int safexcel_ahash_enqueue(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); struct safexcel_crypto_priv *priv = ctx->base.priv; int ret, ring; @@ -741,7 +741,7 @@ static int safexcel_ahash_enqueue(struct ahash_request *areq) static int safexcel_ahash_update(struct ahash_request *areq) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); int ret; /* If the request is 0 length, do nothing */ @@ -766,7 +766,7 @@ static int safexcel_ahash_update(struct ahash_request *areq) static int safexcel_ahash_final(struct ahash_request *areq) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); req->finish = true; @@ -870,7 +870,7 @@ static int safexcel_ahash_final(struct ahash_request *areq) static int safexcel_ahash_finup(struct ahash_request *areq) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); req->finish = true; @@ -880,7 +880,7 @@ static int safexcel_ahash_finup(struct ahash_request *areq) static int safexcel_ahash_export(struct ahash_request *areq, void *out) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); struct safexcel_ahash_export_state *export = out; export->len = req->len; @@ -896,7 +896,7 @@ static int safexcel_ahash_export(struct ahash_request *areq, void *out) static int safexcel_ahash_import(struct ahash_request *areq, const void *in) { - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); const struct safexcel_ahash_export_state *export = in; int ret; @@ -927,15 +927,15 @@ static int safexcel_ahash_cra_init(struct crypto_tfm *tfm) ctx->base.handle_result = safexcel_handle_result; ctx->fb_do_setkey = false; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct safexcel_ahash_req)); + crypto_ahash_set_reqsize_dma(__crypto_ahash_cast(tfm), + sizeof(struct safexcel_ahash_req)); return 0; } static int safexcel_sha1_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1012,7 +1012,7 @@ struct safexcel_alg_template safexcel_alg_sha1 = { static int safexcel_hmac_sha1_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1124,7 +1124,7 @@ static int safexcel_hmac_init_iv(struct ahash_request *areq, if (ret) return ret; - req = ahash_request_ctx(areq); + req = ahash_request_ctx_dma(areq); req->hmac = true; req->last_req = true; @@ -1264,7 +1264,7 @@ struct safexcel_alg_template safexcel_alg_hmac_sha1 = { static int safexcel_sha256_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1321,7 +1321,7 @@ struct safexcel_alg_template safexcel_alg_sha256 = { static int safexcel_sha224_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1385,7 +1385,7 @@ static int safexcel_hmac_sha224_setkey(struct crypto_ahash *tfm, const u8 *key, static int safexcel_hmac_sha224_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1457,7 +1457,7 @@ static int safexcel_hmac_sha256_setkey(struct crypto_ahash *tfm, const u8 *key, static int safexcel_hmac_sha256_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1522,7 +1522,7 @@ struct safexcel_alg_template safexcel_alg_hmac_sha256 = { static int safexcel_sha512_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1579,7 +1579,7 @@ struct safexcel_alg_template safexcel_alg_sha512 = { static int safexcel_sha384_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1643,7 +1643,7 @@ static int safexcel_hmac_sha512_setkey(struct crypto_ahash *tfm, const u8 *key, static int safexcel_hmac_sha512_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1715,7 +1715,7 @@ static int safexcel_hmac_sha384_setkey(struct crypto_ahash *tfm, const u8 *key, static int safexcel_hmac_sha384_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1780,7 +1780,7 @@ struct safexcel_alg_template safexcel_alg_hmac_sha384 = { static int safexcel_md5_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1837,7 +1837,7 @@ struct safexcel_alg_template safexcel_alg_md5 = { static int safexcel_hmac_md5_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1920,7 +1920,7 @@ static int safexcel_crc32_cra_init(struct crypto_tfm *tfm) static int safexcel_crc32_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -1992,7 +1992,7 @@ struct safexcel_alg_template safexcel_alg_crc32 = { static int safexcel_cbcmac_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2252,7 +2252,7 @@ struct safexcel_alg_template safexcel_alg_cmac = { static int safexcel_sm3_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2316,7 +2316,7 @@ static int safexcel_hmac_sm3_setkey(struct crypto_ahash *tfm, const u8 *key, static int safexcel_hmac_sm3_init(struct ahash_request *areq) { struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq)); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2382,7 +2382,7 @@ static int safexcel_sha3_224_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2400,7 +2400,7 @@ static int safexcel_sha3_fbcheck(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); int ret = 0; if (ctx->do_fallback) { @@ -2437,7 +2437,7 @@ static int safexcel_sha3_update(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback = true; return safexcel_sha3_fbcheck(req) ?: crypto_ahash_update(subreq); @@ -2447,7 +2447,7 @@ static int safexcel_sha3_final(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback = true; return safexcel_sha3_fbcheck(req) ?: crypto_ahash_final(subreq); @@ -2457,7 +2457,7 @@ static int safexcel_sha3_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback |= !req->nbytes; if (ctx->do_fallback) @@ -2472,7 +2472,7 @@ static int safexcel_sha3_digest_fallback(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback = true; ctx->fb_init_done = false; @@ -2492,7 +2492,7 @@ static int safexcel_sha3_export(struct ahash_request *req, void *out) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback = true; return safexcel_sha3_fbcheck(req) ?: crypto_ahash_export(subreq, out); @@ -2502,7 +2502,7 @@ static int safexcel_sha3_import(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *subreq = ahash_request_ctx(req); + struct ahash_request *subreq = ahash_request_ctx_dma(req); ctx->do_fallback = true; return safexcel_sha3_fbcheck(req) ?: crypto_ahash_import(subreq, in); @@ -2526,9 +2526,10 @@ static int safexcel_sha3_cra_init(struct crypto_tfm *tfm) /* Update statesize from fallback algorithm! */ crypto_hash_alg_common(ahash)->statesize = crypto_ahash_statesize(ctx->fback); - crypto_ahash_set_reqsize(ahash, max(sizeof(struct safexcel_ahash_req), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(ctx->fback))); + crypto_ahash_set_reqsize_dma( + ahash, max(sizeof(struct safexcel_ahash_req), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(ctx->fback))); return 0; } @@ -2575,7 +2576,7 @@ static int safexcel_sha3_256_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2633,7 +2634,7 @@ static int safexcel_sha3_384_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2691,7 +2692,7 @@ static int safexcel_sha3_512_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2841,7 +2842,7 @@ static int safexcel_hmac_sha3_224_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2912,7 +2913,7 @@ static int safexcel_hmac_sha3_256_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -2983,7 +2984,7 @@ static int safexcel_hmac_sha3_384_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); @@ -3054,7 +3055,7 @@ static int safexcel_hmac_sha3_512_init(struct ahash_request *areq) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq); struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(tfm); - struct safexcel_ahash_req *req = ahash_request_ctx(areq); + struct safexcel_ahash_req *req = ahash_request_ctx_dma(areq); memset(req, 0, sizeof(*req)); -- GitLab From be75969c81d9a6e13487e1c043e62ed5432d9fa1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:20:59 +0800 Subject: [PATCH 1272/1423] crypto: keembay - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/keembay/keembay-ocs-hcu-core.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/crypto/keembay/keembay-ocs-hcu-core.c b/drivers/crypto/keembay/keembay-ocs-hcu-core.c index 0379dbf32a4c4..d4bcbed1f5466 100644 --- a/drivers/crypto/keembay/keembay-ocs-hcu-core.c +++ b/drivers/crypto/keembay/keembay-ocs-hcu-core.c @@ -226,7 +226,7 @@ static void kmb_ocs_hcu_dma_cleanup(struct ahash_request *req, */ static int kmb_ocs_dma_prepare(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); struct device *dev = rctx->hcu_dev->dev; unsigned int remainder = 0; unsigned int total; @@ -356,7 +356,7 @@ cleanup: static void kmb_ocs_hcu_secure_cleanup(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); /* Clear buffer of any data. */ memzero_explicit(rctx->buffer, sizeof(rctx->buffer)); @@ -374,7 +374,7 @@ static int kmb_ocs_hcu_handle_queue(struct ahash_request *req) static int prepare_ipad(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm); int i; @@ -414,7 +414,7 @@ static int kmb_ocs_hcu_do_one_request(struct crypto_engine *engine, void *areq) base); struct ocs_hcu_dev *hcu_dev = kmb_ocs_hcu_find_dev(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); struct ocs_hcu_ctx *tctx = crypto_ahash_ctx(tfm); int rc; int i; @@ -561,7 +561,7 @@ error: static int kmb_ocs_hcu_init(struct ahash_request *req) { struct ocs_hcu_dev *hcu_dev = kmb_ocs_hcu_find_dev(req); - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm); @@ -614,7 +614,7 @@ static int kmb_ocs_hcu_init(struct ahash_request *req) static int kmb_ocs_hcu_update(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); int rc; if (!req->nbytes) @@ -650,7 +650,7 @@ static int kmb_ocs_hcu_update(struct ahash_request *req) /* Common logic for kmb_ocs_hcu_final() and kmb_ocs_hcu_finup(). */ static int kmb_ocs_hcu_fin_common(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm); int rc; @@ -687,7 +687,7 @@ static int kmb_ocs_hcu_fin_common(struct ahash_request *req) static int kmb_ocs_hcu_final(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); rctx->sg_data_total = 0; rctx->sg_data_offset = 0; @@ -698,7 +698,7 @@ static int kmb_ocs_hcu_final(struct ahash_request *req) static int kmb_ocs_hcu_finup(struct ahash_request *req) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); rctx->sg_data_total = req->nbytes; rctx->sg_data_offset = 0; @@ -726,7 +726,7 @@ static int kmb_ocs_hcu_digest(struct ahash_request *req) static int kmb_ocs_hcu_export(struct ahash_request *req, void *out) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); /* Intermediate data is always stored and applied per request. */ memcpy(out, rctx, sizeof(*rctx)); @@ -736,7 +736,7 @@ static int kmb_ocs_hcu_export(struct ahash_request *req, void *out) static int kmb_ocs_hcu_import(struct ahash_request *req, const void *in) { - struct ocs_hcu_rctx *rctx = ahash_request_ctx(req); + struct ocs_hcu_rctx *rctx = ahash_request_ctx_dma(req); /* Intermediate data is always stored and applied per request. */ memcpy(rctx, in, sizeof(*rctx)); @@ -822,8 +822,8 @@ err_free_ahash: /* Set request size and initialize tfm context. */ static void __cra_init(struct crypto_tfm *tfm, struct ocs_hcu_ctx *ctx) { - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ocs_hcu_rctx)); + crypto_ahash_set_reqsize_dma(__crypto_ahash_cast(tfm), + sizeof(struct ocs_hcu_rctx)); /* Init context to 0. */ memzero_explicit(ctx, sizeof(*ctx)); -- GitLab From 0a55f4e38556f7e59b0f30fac0751e3a04be44c2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:21:01 +0800 Subject: [PATCH 1273/1423] crypto: octeontx - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- .../crypto/marvell/octeontx/otx_cptvf_algs.c | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c index 01c48ddc4eeb4..80ba77c793a7c 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c @@ -103,7 +103,7 @@ static inline int validate_hmac_cipher_null(struct otx_cpt_req_info *cpt_req) req = container_of(cpt_req->areq, struct aead_request, base); tfm = crypto_aead_reqtfm(req); - rctx = aead_request_ctx(req); + rctx = aead_request_ctx_dma(req); if (memcmp(rctx->fctx.hmac.s.hmac_calc, rctx->fctx.hmac.s.hmac_recv, crypto_aead_authsize(tfm)) != 0) @@ -155,7 +155,7 @@ static void output_iv_copyback(struct crypto_async_request *areq) ctx = crypto_skcipher_ctx(stfm); if (ctx->cipher_type == OTX_CPT_AES_CBC || ctx->cipher_type == OTX_CPT_DES3_CBC) { - rctx = skcipher_request_ctx(sreq); + rctx = skcipher_request_ctx_dma(sreq); req_info = &rctx->cpt_req; ivsize = crypto_skcipher_ivsize(stfm); start = sreq->cryptlen - ivsize; @@ -233,7 +233,7 @@ static inline u32 create_ctx_hdr(struct skcipher_request *req, u32 enc, u32 *argcnt) { struct crypto_skcipher *stfm = crypto_skcipher_reqtfm(req); - struct otx_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; struct crypto_tfm *tfm = crypto_skcipher_tfm(stfm); struct otx_cpt_enc_ctx *ctx = crypto_tfm_ctx(tfm); @@ -303,7 +303,7 @@ static inline u32 create_ctx_hdr(struct skcipher_request *req, u32 enc, static inline u32 create_input_list(struct skcipher_request *req, u32 enc, u32 enc_iv_len) { - struct otx_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0; int ret; @@ -321,7 +321,7 @@ static inline u32 create_input_list(struct skcipher_request *req, u32 enc, static inline void create_output_list(struct skcipher_request *req, u32 enc_iv_len) { - struct otx_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0; @@ -340,7 +340,7 @@ static inline void create_output_list(struct skcipher_request *req, static inline int cpt_enc_dec(struct skcipher_request *req, u32 enc) { struct crypto_skcipher *stfm = crypto_skcipher_reqtfm(req); - struct otx_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 enc_iv_len = crypto_skcipher_ivsize(stfm); struct pci_dev *pdev; @@ -501,15 +501,16 @@ static int otx_cpt_enc_dec_init(struct crypto_skcipher *tfm) * allocated since the cryptd daemon uses * this memory for request_ctx information */ - crypto_skcipher_set_reqsize(tfm, sizeof(struct otx_cpt_req_ctx) + - sizeof(struct skcipher_request)); + crypto_skcipher_set_reqsize_dma( + tfm, sizeof(struct otx_cpt_req_ctx) + + sizeof(struct skcipher_request)); return 0; } static int cpt_aead_init(struct crypto_aead *tfm, u8 cipher_type, u8 mac_type) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); ctx->cipher_type = cipher_type; ctx->mac_type = mac_type; @@ -551,7 +552,7 @@ static int cpt_aead_init(struct crypto_aead *tfm, u8 cipher_type, u8 mac_type) } } - crypto_aead_set_reqsize(tfm, sizeof(struct otx_cpt_req_ctx)); + crypto_aead_set_reqsize_dma(tfm, sizeof(struct otx_cpt_req_ctx)); return 0; } @@ -603,7 +604,7 @@ static int otx_cpt_aead_gcm_aes_init(struct crypto_aead *tfm) static void otx_cpt_aead_exit(struct crypto_aead *tfm) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); kfree(ctx->ipad); kfree(ctx->opad); @@ -619,7 +620,7 @@ static void otx_cpt_aead_exit(struct crypto_aead *tfm) static int otx_cpt_aead_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); switch (ctx->mac_type) { case OTX_CPT_SHA1: @@ -739,7 +740,7 @@ static int copy_pad(u8 mac_type, u8 *out_pad, u8 *in_pad) static int aead_hmac_init(struct crypto_aead *cipher) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); int state_size = crypto_shash_statesize(ctx->hashalg); int ds = crypto_shash_digestsize(ctx->hashalg); int bs = crypto_shash_blocksize(ctx->hashalg); @@ -837,7 +838,7 @@ static int otx_cpt_aead_cbc_aes_sha_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); struct crypto_authenc_key_param *param; int enckeylen = 0, authkeylen = 0; struct rtattr *rta = (void *)key; @@ -896,7 +897,7 @@ static int otx_cpt_aead_ecb_null_sha_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); struct crypto_authenc_key_param *param; struct rtattr *rta = (void *)key; int enckeylen = 0; @@ -932,7 +933,7 @@ static int otx_cpt_aead_gcm_aes_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); /* * For aes gcm we expect to get encryption key (16, 24, 32 bytes) @@ -965,9 +966,9 @@ static int otx_cpt_aead_gcm_aes_setkey(struct crypto_aead *cipher, static inline u32 create_aead_ctx_hdr(struct aead_request *req, u32 enc, u32 *argcnt) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); struct otx_cpt_req_info *req_info = &rctx->cpt_req; struct otx_cpt_fc_ctx *fctx = &rctx->fctx; int mac_len = crypto_aead_authsize(tfm); @@ -1050,9 +1051,9 @@ static inline u32 create_aead_ctx_hdr(struct aead_request *req, u32 enc, static inline u32 create_hmac_ctx_hdr(struct aead_request *req, u32 *argcnt, u32 enc) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); struct otx_cpt_req_info *req_info = &rctx->cpt_req; req_info->ctrl.s.dma_mode = OTX_CPT_DMA_GATHER_SCATTER; @@ -1076,7 +1077,7 @@ static inline u32 create_hmac_ctx_hdr(struct aead_request *req, u32 *argcnt, static inline u32 create_aead_input_list(struct aead_request *req, u32 enc) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 inputlen = req->cryptlen + req->assoclen; u32 status, argcnt = 0; @@ -1093,7 +1094,7 @@ static inline u32 create_aead_input_list(struct aead_request *req, u32 enc) static inline u32 create_aead_output_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0, outputlen = 0; @@ -1111,7 +1112,7 @@ static inline u32 create_aead_output_list(struct aead_request *req, u32 enc, static inline u32 create_aead_null_input_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; u32 inputlen, argcnt = 0; @@ -1130,7 +1131,7 @@ static inline u32 create_aead_null_input_list(struct aead_request *req, static inline u32 create_aead_null_output_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; struct scatterlist *dst; u8 *ptr = NULL; @@ -1217,7 +1218,7 @@ error: static u32 cpt_aead_enc_dec(struct aead_request *req, u8 reg_type, u8 enc) { - struct otx_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx_cpt_req_info *req_info = &rctx->cpt_req; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct pci_dev *pdev; @@ -1409,7 +1410,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha1_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1428,7 +1429,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha256_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1447,7 +1448,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha384_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1466,7 +1467,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha512_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1485,7 +1486,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha1_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1504,7 +1505,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha256_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1523,7 +1524,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha384_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1542,7 +1543,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha512_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1561,7 +1562,7 @@ static struct aead_alg otx_cpt_aeads[] = { { .cra_driver_name = "cpt_rfc4106_gcm_aes", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY, - .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, -- GitLab From d887dec105cdeda6b8da0e84d96c7a07d80269bc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:21:03 +0800 Subject: [PATCH 1274/1423] crypto: octeontx2 - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- .../marvell/octeontx2/otx2_cptvf_algs.c | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c index 67530e90bbfed..30b423605c9c1 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_algs.c @@ -87,7 +87,7 @@ static inline int validate_hmac_cipher_null(struct otx2_cpt_req_info *cpt_req) req = container_of(cpt_req->areq, struct aead_request, base); tfm = crypto_aead_reqtfm(req); - rctx = aead_request_ctx(req); + rctx = aead_request_ctx_dma(req); if (memcmp(rctx->fctx.hmac.s.hmac_calc, rctx->fctx.hmac.s.hmac_recv, crypto_aead_authsize(tfm)) != 0) @@ -137,7 +137,7 @@ static void output_iv_copyback(struct crypto_async_request *areq) ctx = crypto_skcipher_ctx(stfm); if (ctx->cipher_type == OTX2_CPT_AES_CBC || ctx->cipher_type == OTX2_CPT_DES3_CBC) { - rctx = skcipher_request_ctx(sreq); + rctx = skcipher_request_ctx_dma(sreq); req_info = &rctx->cpt_req; ivsize = crypto_skcipher_ivsize(stfm); start = sreq->cryptlen - ivsize; @@ -219,7 +219,7 @@ static inline int create_ctx_hdr(struct skcipher_request *req, u32 enc, u32 *argcnt) { struct crypto_skcipher *stfm = crypto_skcipher_reqtfm(req); - struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx2_cpt_enc_ctx *ctx = crypto_skcipher_ctx(stfm); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; struct otx2_cpt_fc_ctx *fctx = &rctx->fctx; @@ -288,7 +288,7 @@ static inline int create_ctx_hdr(struct skcipher_request *req, u32 enc, static inline int create_input_list(struct skcipher_request *req, u32 enc, u32 enc_iv_len) { - struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0; int ret; @@ -306,7 +306,7 @@ static inline int create_input_list(struct skcipher_request *req, u32 enc, static inline void create_output_list(struct skcipher_request *req, u32 enc_iv_len) { - struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0; @@ -325,7 +325,7 @@ static inline void create_output_list(struct skcipher_request *req, static int skcipher_do_fallback(struct skcipher_request *req, bool is_enc) { struct crypto_skcipher *stfm = crypto_skcipher_reqtfm(req); - struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx2_cpt_enc_ctx *ctx = crypto_skcipher_ctx(stfm); int ret; @@ -348,7 +348,7 @@ static int skcipher_do_fallback(struct skcipher_request *req, bool is_enc) static inline int cpt_enc_dec(struct skcipher_request *req, u32 enc) { struct crypto_skcipher *stfm = crypto_skcipher_reqtfm(req); - struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = skcipher_request_ctx_dma(req); struct otx2_cpt_enc_ctx *ctx = crypto_skcipher_ctx(stfm); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 enc_iv_len = crypto_skcipher_ivsize(stfm); @@ -537,8 +537,9 @@ static int otx2_cpt_enc_dec_init(struct crypto_skcipher *stfm) * allocated since the cryptd daemon uses * this memory for request_ctx information */ - crypto_skcipher_set_reqsize(stfm, sizeof(struct otx2_cpt_req_ctx) + - sizeof(struct skcipher_request)); + crypto_skcipher_set_reqsize_dma( + stfm, sizeof(struct otx2_cpt_req_ctx) + + sizeof(struct skcipher_request)); return cpt_skcipher_fallback_init(ctx, alg); } @@ -572,7 +573,7 @@ static int cpt_aead_fallback_init(struct otx2_cpt_aead_ctx *ctx, static int cpt_aead_init(struct crypto_aead *atfm, u8 cipher_type, u8 mac_type) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(atfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(atfm); struct crypto_tfm *tfm = crypto_aead_tfm(atfm); struct crypto_alg *alg = tfm->__crt_alg; @@ -629,7 +630,7 @@ static int cpt_aead_init(struct crypto_aead *atfm, u8 cipher_type, u8 mac_type) ctx->enc_align_len = 1; break; } - crypto_aead_set_reqsize(atfm, sizeof(struct otx2_cpt_req_ctx)); + crypto_aead_set_reqsize_dma(atfm, sizeof(struct otx2_cpt_req_ctx)); return cpt_aead_fallback_init(ctx, alg); } @@ -681,7 +682,7 @@ static int otx2_cpt_aead_gcm_aes_init(struct crypto_aead *tfm) static void otx2_cpt_aead_exit(struct crypto_aead *tfm) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); kfree(ctx->ipad); kfree(ctx->opad); @@ -698,7 +699,7 @@ static void otx2_cpt_aead_exit(struct crypto_aead *tfm) static int otx2_cpt_aead_gcm_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); if (crypto_rfc4106_check_authsize(authsize)) return -EINVAL; @@ -722,7 +723,7 @@ static int otx2_cpt_aead_set_authsize(struct crypto_aead *tfm, static int otx2_cpt_aead_null_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); ctx->is_trunc_hmac = true; tfm->authsize = authsize; @@ -794,7 +795,7 @@ static int copy_pad(u8 mac_type, u8 *out_pad, u8 *in_pad) static int aead_hmac_init(struct crypto_aead *cipher) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); int state_size = crypto_shash_statesize(ctx->hashalg); int ds = crypto_shash_digestsize(ctx->hashalg); int bs = crypto_shash_blocksize(ctx->hashalg); @@ -892,7 +893,7 @@ static int otx2_cpt_aead_cbc_aes_sha_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); struct crypto_authenc_key_param *param; int enckeylen = 0, authkeylen = 0; struct rtattr *rta = (void *)key; @@ -944,7 +945,7 @@ static int otx2_cpt_aead_ecb_null_sha_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); struct crypto_authenc_key_param *param; struct rtattr *rta = (void *)key; int enckeylen = 0; @@ -979,7 +980,7 @@ static int otx2_cpt_aead_gcm_aes_setkey(struct crypto_aead *cipher, const unsigned char *key, unsigned int keylen) { - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(cipher); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(cipher); /* * For aes gcm we expect to get encryption key (16, 24, 32 bytes) @@ -1012,9 +1013,9 @@ static int otx2_cpt_aead_gcm_aes_setkey(struct crypto_aead *cipher, static inline int create_aead_ctx_hdr(struct aead_request *req, u32 enc, u32 *argcnt) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; struct otx2_cpt_fc_ctx *fctx = &rctx->fctx; int mac_len = crypto_aead_authsize(tfm); @@ -1103,9 +1104,9 @@ static inline int create_aead_ctx_hdr(struct aead_request *req, u32 enc, static inline void create_hmac_ctx_hdr(struct aead_request *req, u32 *argcnt, u32 enc) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; req_info->ctrl.s.dma_mode = OTX2_CPT_DMA_MODE_SG; @@ -1127,7 +1128,7 @@ static inline void create_hmac_ctx_hdr(struct aead_request *req, u32 *argcnt, static inline int create_aead_input_list(struct aead_request *req, u32 enc) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 inputlen = req->cryptlen + req->assoclen; u32 status, argcnt = 0; @@ -1144,7 +1145,7 @@ static inline int create_aead_input_list(struct aead_request *req, u32 enc) static inline void create_aead_output_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 argcnt = 0, outputlen = 0; @@ -1160,7 +1161,7 @@ static inline void create_aead_output_list(struct aead_request *req, u32 enc, static inline void create_aead_null_input_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; u32 inputlen, argcnt = 0; @@ -1177,7 +1178,7 @@ static inline void create_aead_null_input_list(struct aead_request *req, static inline int create_aead_null_output_list(struct aead_request *req, u32 enc, u32 mac_len) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; struct scatterlist *dst; u8 *ptr = NULL; @@ -1257,9 +1258,9 @@ error_free: static int aead_do_fallback(struct aead_request *req, bool is_enc) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(aead); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(aead); int ret; if (ctx->fbk_cipher) { @@ -1281,10 +1282,10 @@ static int aead_do_fallback(struct aead_request *req, bool is_enc) static int cpt_aead_enc_dec(struct aead_request *req, u8 reg_type, u8 enc) { - struct otx2_cpt_req_ctx *rctx = aead_request_ctx(req); + struct otx2_cpt_req_ctx *rctx = aead_request_ctx_dma(req); struct otx2_cpt_req_info *req_info = &rctx->cpt_req; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx(tfm); + struct otx2_cpt_aead_ctx *ctx = crypto_aead_ctx_dma(tfm); struct pci_dev *pdev; int status, cpu_num; @@ -1458,7 +1459,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha1_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1477,7 +1478,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha256_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1496,7 +1497,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha384_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1515,7 +1516,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha512_cbc_aes", .cra_blocksize = AES_BLOCK_SIZE, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1534,7 +1535,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha1_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1553,7 +1554,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha256_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1572,7 +1573,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha384_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1591,7 +1592,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_hmac_sha512_ecb_null", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, @@ -1610,7 +1611,7 @@ static struct aead_alg otx2_cpt_aeads[] = { { .cra_driver_name = "cpt_rfc4106_gcm_aes", .cra_blocksize = 1, .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK, - .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx), + .cra_ctxsize = sizeof(struct otx2_cpt_aead_ctx) + CRYPTO_DMA_PADDING, .cra_priority = 4001, .cra_alignmask = 0, .cra_module = THIS_MODULE, -- GitLab From 18daae5b0c41bf54af5f162a3e205858c9771400 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2022 17:21:05 +0800 Subject: [PATCH 1275/1423] crypto: qce - Set DMA alignment explicitly This driver has been implicitly relying on kmalloc alignment to be sufficient for DMA. This may no longer be the case with upcoming arm64 changes. This patch changes it to explicitly request DMA alignment from the Crypto API. Signed-off-by: Herbert Xu --- drivers/crypto/qce/aead.c | 22 +++++++++++----------- drivers/crypto/qce/common.c | 5 +++-- drivers/crypto/qce/sha.c | 18 +++++++++--------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/crypto/qce/aead.c b/drivers/crypto/qce/aead.c index 6eb4d2e356297..7d811728f0478 100644 --- a/drivers/crypto/qce/aead.c +++ b/drivers/crypto/qce/aead.c @@ -24,7 +24,7 @@ static void qce_aead_done(void *data) { struct crypto_async_request *async_req = data; struct aead_request *req = aead_request_cast(async_req); - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_aead_ctx *ctx = crypto_tfm_ctx(async_req->tfm); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); struct qce_device *qce = tmpl->qce; @@ -92,7 +92,7 @@ static void qce_aead_done(void *data) static struct scatterlist * qce_aead_prepare_result_buf(struct sg_table *tbl, struct aead_request *req) { - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); struct qce_device *qce = tmpl->qce; @@ -103,7 +103,7 @@ qce_aead_prepare_result_buf(struct sg_table *tbl, struct aead_request *req) static struct scatterlist * qce_aead_prepare_ccm_result_buf(struct sg_table *tbl, struct aead_request *req) { - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); sg_init_one(&rctx->result_sg, rctx->ccmresult_buf, QCE_BAM_BURST_SIZE); return qce_sgtable_add(tbl, &rctx->result_sg, QCE_BAM_BURST_SIZE); @@ -112,7 +112,7 @@ qce_aead_prepare_ccm_result_buf(struct sg_table *tbl, struct aead_request *req) static struct scatterlist * qce_aead_prepare_dst_buf(struct aead_request *req) { - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); struct qce_device *qce = tmpl->qce; struct scatterlist *sg, *msg_sg, __sg[2]; @@ -186,7 +186,7 @@ qce_aead_ccm_prepare_buf_assoclen(struct aead_request *req) { struct scatterlist *sg, *msg_sg, __sg[2]; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_aead_ctx *ctx = crypto_aead_ctx(tfm); unsigned int assoclen = rctx->assoclen; unsigned int adata_header_len, cryptlen, totallen; @@ -300,7 +300,7 @@ err_free: static int qce_aead_prepare_buf(struct aead_request *req) { - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); struct qce_device *qce = tmpl->qce; struct scatterlist *sg; @@ -328,7 +328,7 @@ static int qce_aead_prepare_buf(struct aead_request *req) static int qce_aead_ccm_prepare_buf(struct aead_request *req) { - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct qce_aead_ctx *ctx = crypto_aead_ctx(tfm); struct scatterlist *sg; @@ -408,7 +408,7 @@ static int qce_aead_async_req_handle(struct crypto_async_request *async_req) { struct aead_request *req = aead_request_cast(async_req); - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct qce_aead_ctx *ctx = crypto_tfm_ctx(async_req->tfm); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); @@ -502,7 +502,7 @@ error_free: static int qce_aead_crypt(struct aead_request *req, int encrypt) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_aead_ctx *ctx = crypto_aead_ctx(tfm); struct qce_alg_template *tmpl = to_aead_tmpl(tfm); unsigned int blocksize = crypto_aead_blocksize(tfm); @@ -675,8 +675,8 @@ static int qce_aead_init(struct crypto_aead *tfm) if (IS_ERR(ctx->fallback)) return PTR_ERR(ctx->fallback); - crypto_aead_set_reqsize(tfm, sizeof(struct qce_aead_reqctx) + - crypto_aead_reqsize(ctx->fallback)); + crypto_aead_set_reqsize_dma(tfm, sizeof(struct qce_aead_reqctx) + + crypto_aead_reqsize(ctx->fallback)); return 0; } diff --git a/drivers/crypto/qce/common.c b/drivers/crypto/qce/common.c index 7c612ba5068f7..04253a8d33409 100644 --- a/drivers/crypto/qce/common.c +++ b/drivers/crypto/qce/common.c @@ -3,6 +3,7 @@ * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved. */ +#include #include #include #include @@ -147,7 +148,7 @@ static int qce_setup_regs_ahash(struct crypto_async_request *async_req) { struct ahash_request *req = ahash_request_cast(async_req); struct crypto_ahash *ahash = __crypto_ahash_cast(async_req->tfm); - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(async_req->tfm); struct qce_device *qce = tmpl->qce; unsigned int digestsize = crypto_ahash_digestsize(ahash); @@ -419,7 +420,7 @@ static unsigned int qce_be32_to_cpu_array(u32 *dst, const u8 *src, unsigned int static int qce_setup_regs_aead(struct crypto_async_request *async_req) { struct aead_request *req = aead_request_cast(async_req); - struct qce_aead_reqctx *rctx = aead_request_ctx(req); + struct qce_aead_reqctx *rctx = aead_request_ctx_dma(req); struct qce_aead_ctx *ctx = crypto_tfm_ctx(async_req->tfm); struct qce_alg_template *tmpl = to_aead_tmpl(crypto_aead_reqtfm(req)); struct qce_device *qce = tmpl->qce; diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c index 37bafd7aeb79d..fc72af8aa9a72 100644 --- a/drivers/crypto/qce/sha.c +++ b/drivers/crypto/qce/sha.c @@ -38,7 +38,7 @@ static void qce_ahash_done(void *data) struct crypto_async_request *async_req = data; struct ahash_request *req = ahash_request_cast(async_req); struct crypto_ahash *ahash = crypto_ahash_reqtfm(req); - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(async_req->tfm); struct qce_device *qce = tmpl->qce; struct qce_result_dump *result = qce->dma.result_buf; @@ -75,7 +75,7 @@ static void qce_ahash_done(void *data) static int qce_ahash_async_req_handle(struct crypto_async_request *async_req) { struct ahash_request *req = ahash_request_cast(async_req); - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_sha_ctx *ctx = crypto_tfm_ctx(async_req->tfm); struct qce_alg_template *tmpl = to_ahash_tmpl(async_req->tfm); struct qce_device *qce = tmpl->qce; @@ -132,7 +132,7 @@ error_unmap_src: static int qce_ahash_init(struct ahash_request *req) { - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(req->base.tfm); const u32 *std_iv = tmpl->std_iv; @@ -147,7 +147,7 @@ static int qce_ahash_init(struct ahash_request *req) static int qce_ahash_export(struct ahash_request *req, void *out) { - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_sha_saved_state *export_state = out; memcpy(export_state->pending_buf, rctx->buf, rctx->buflen); @@ -164,7 +164,7 @@ static int qce_ahash_export(struct ahash_request *req, void *out) static int qce_ahash_import(struct ahash_request *req, const void *in) { - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); const struct qce_sha_saved_state *import_state = in; memset(rctx, 0, sizeof(*rctx)); @@ -183,7 +183,7 @@ static int qce_ahash_import(struct ahash_request *req, const void *in) static int qce_ahash_update(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(req->base.tfm); struct qce_device *qce = tmpl->qce; struct scatterlist *sg_last, *sg; @@ -275,7 +275,7 @@ static int qce_ahash_update(struct ahash_request *req) static int qce_ahash_final(struct ahash_request *req) { - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(req->base.tfm); struct qce_device *qce = tmpl->qce; @@ -302,7 +302,7 @@ static int qce_ahash_final(struct ahash_request *req) static int qce_ahash_digest(struct ahash_request *req) { - struct qce_sha_reqctx *rctx = ahash_request_ctx(req); + struct qce_sha_reqctx *rctx = ahash_request_ctx_dma(req); struct qce_alg_template *tmpl = to_ahash_tmpl(req->base.tfm); struct qce_device *qce = tmpl->qce; int ret; @@ -395,7 +395,7 @@ static int qce_ahash_cra_init(struct crypto_tfm *tfm) struct crypto_ahash *ahash = __crypto_ahash_cast(tfm); struct qce_sha_ctx *ctx = crypto_tfm_ctx(tfm); - crypto_ahash_set_reqsize(ahash, sizeof(struct qce_sha_reqctx)); + crypto_ahash_set_reqsize_dma(ahash, sizeof(struct qce_sha_reqctx)); memset(ctx, 0, sizeof(*ctx)); return 0; } -- GitLab From ecadb5b0111ea19fc7c240bb25d424a94471eb7d Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 2 Dec 2022 21:22:33 +0800 Subject: [PATCH 1276/1423] hwrng: amd - Fix PCI device refcount leak for_each_pci_dev() is implemented by pci_get_device(). The comment of pci_get_device() says that it will increase the reference count for the returned pci_dev and also decrease the reference count for the input pci_dev @from if it is not NULL. If we break for_each_pci_dev() loop with pdev not NULL, we need to call pci_dev_put() to decrease the reference count. Add the missing pci_dev_put() for the normal and error path. Fixes: 96d63c0297cc ("[PATCH] Add AMD HW RNG driver") Signed-off-by: Xiongfeng Wang Signed-off-by: Herbert Xu --- drivers/char/hw_random/amd-rng.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c index c22d4184bb612..0555e3838bce1 100644 --- a/drivers/char/hw_random/amd-rng.c +++ b/drivers/char/hw_random/amd-rng.c @@ -143,15 +143,19 @@ static int __init amd_rng_mod_init(void) found: err = pci_read_config_dword(pdev, 0x58, &pmbase); if (err) - return err; + goto put_dev; pmbase &= 0x0000FF00; - if (pmbase == 0) - return -EIO; + if (pmbase == 0) { + err = -EIO; + goto put_dev; + } priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; + if (!priv) { + err = -ENOMEM; + goto put_dev; + } if (!request_region(pmbase + PMBASE_OFFSET, PMBASE_SIZE, DRV_NAME)) { dev_err(&pdev->dev, DRV_NAME " region 0x%x already in use!\n", @@ -185,6 +189,8 @@ err_iomap: release_region(pmbase + PMBASE_OFFSET, PMBASE_SIZE); out: kfree(priv); +put_dev: + pci_dev_put(pdev); return err; } @@ -200,6 +206,8 @@ static void __exit amd_rng_mod_exit(void) release_region(priv->pmbase + PMBASE_OFFSET, PMBASE_SIZE); + pci_dev_put(priv->pcidev); + kfree(priv); } -- GitLab From 9f6ec8dc574efb7f4f3d7ee9cd59ae307e78f445 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 2 Dec 2022 21:22:34 +0800 Subject: [PATCH 1277/1423] hwrng: geode - Fix PCI device refcount leak for_each_pci_dev() is implemented by pci_get_device(). The comment of pci_get_device() says that it will increase the reference count for the returned pci_dev and also decrease the reference count for the input pci_dev @from if it is not NULL. If we break for_each_pci_dev() loop with pdev not NULL, we need to call pci_dev_put() to decrease the reference count. We add a new struct 'amd_geode_priv' to record pointer of the pci_dev and membase, and then add missing pci_dev_put() for the normal and error path. Fixes: ef5d862734b8 ("[PATCH] Add Geode HW RNG driver") Signed-off-by: Xiongfeng Wang Signed-off-by: Herbert Xu --- drivers/char/hw_random/geode-rng.c | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/char/hw_random/geode-rng.c b/drivers/char/hw_random/geode-rng.c index 138ce434f86b2..12fbe80918319 100644 --- a/drivers/char/hw_random/geode-rng.c +++ b/drivers/char/hw_random/geode-rng.c @@ -51,6 +51,10 @@ static const struct pci_device_id pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, pci_tbl); +struct amd_geode_priv { + struct pci_dev *pcidev; + void __iomem *membase; +}; static int geode_rng_data_read(struct hwrng *rng, u32 *data) { @@ -90,6 +94,7 @@ static int __init geode_rng_init(void) const struct pci_device_id *ent; void __iomem *mem; unsigned long rng_base; + struct amd_geode_priv *priv; for_each_pci_dev(pdev) { ent = pci_match_id(pci_tbl, pdev); @@ -97,17 +102,26 @@ static int __init geode_rng_init(void) goto found; } /* Device not found. */ - goto out; + return err; found: + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + err = -ENOMEM; + goto put_dev; + } + rng_base = pci_resource_start(pdev, 0); if (rng_base == 0) - goto out; + goto free_priv; err = -ENOMEM; mem = ioremap(rng_base, 0x58); if (!mem) - goto out; - geode_rng.priv = (unsigned long)mem; + goto free_priv; + + geode_rng.priv = (unsigned long)priv; + priv->membase = mem; + priv->pcidev = pdev; pr_info("AMD Geode RNG detected\n"); err = hwrng_register(&geode_rng); @@ -116,20 +130,26 @@ found: err); goto err_unmap; } -out: return err; err_unmap: iounmap(mem); - goto out; +free_priv: + kfree(priv); +put_dev: + pci_dev_put(pdev); + return err; } static void __exit geode_rng_exit(void) { - void __iomem *mem = (void __iomem *)geode_rng.priv; + struct amd_geode_priv *priv; + priv = (struct amd_geode_priv *)geode_rng.priv; hwrng_unregister(&geode_rng); - iounmap(mem); + iounmap(priv->membase); + pci_dev_put(priv->pcidev); + kfree(priv); } module_init(geode_rng_init); -- GitLab From 6c013679eb5c7e0b09cbcb64276f6dd97b473d12 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 3 Dec 2022 10:15:15 +0100 Subject: [PATCH 1278/1423] dt-bindings: crypto: Let STM32 define Ux500 CRYP This adds device tree bindings for the Ux500 CRYP block as a compatible in the STM32 CRYP bindings. The Ux500 CRYP binding has been used for ages in the kernel device tree for Ux500 but was never documented, so fill in the gap by making it a sibling of the STM32 CRYP block, which is what it is. The relationship to the existing STM32 CRYP block is pretty obvious when looking at the register map, and I have written patches to reuse the STM32 CRYP driver on the Ux500. The two properties added are DMA channels and power domain. Power domains are a generic SoC feature and the STM32 variant also has DMA channels. Cc: devicetree@vger.kernel.org Cc: Rob Herring Cc: Krzysztof Kozlowski Cc: Lionel Debieve Cc: Maxime Coquelin Cc: Alexandre Torgue Acked-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- .../bindings/crypto/st,stm32-cryp.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml b/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml index ed23bf94a8e00..6759c5bf3e572 100644 --- a/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml +++ b/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml @@ -6,12 +6,18 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: STMicroelectronics STM32 CRYP bindings +description: The STM32 CRYP block is built on the CRYP block found in + the STn8820 SoC introduced in 2007, and subsequently used in the U8500 + SoC in 2010. + maintainers: - Lionel Debieve properties: compatible: enum: + - st,stn8820-cryp + - stericsson,ux500-cryp - st,stm32f756-cryp - st,stm32mp1-cryp @@ -27,6 +33,19 @@ properties: resets: maxItems: 1 + dmas: + items: + - description: mem2cryp DMA channel + - description: cryp2mem DMA channel + + dma-names: + items: + - const: mem2cryp + - const: cryp2mem + + power-domains: + maxItems: 1 + required: - compatible - reg -- GitLab From fe867538c1620738bda5328a14179a3c2bc95ab1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 3 Dec 2022 10:15:16 +0100 Subject: [PATCH 1279/1423] crypto: stm32 - enable drivers to be used on Ux500 The Ux500 cryp and hash drivers are older versions of the hardware managed by the stm32 driver. Instead of trying to improve the Ux500 cryp and hash drivers, start to switch over to the modern and more well-maintained STM32 drivers. Cc: Maxime Coquelin Cc: Alexandre Torgue Acked-by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/Makefile | 2 +- drivers/crypto/stm32/Kconfig | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index 116de173a66c5..fa8bf1be1a8cd 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o obj-$(CONFIG_CRYPTO_DEV_SA2UL) += sa2ul.o obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o obj-$(CONFIG_CRYPTO_DEV_SL3516) += gemini/ -obj-$(CONFIG_ARCH_STM32) += stm32/ +obj-y += stm32/ obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/ obj-$(CONFIG_CRYPTO_DEV_VIRTIO) += virtio/ diff --git a/drivers/crypto/stm32/Kconfig b/drivers/crypto/stm32/Kconfig index 4a4c3284ae1f3..4fc581e9e5956 100644 --- a/drivers/crypto/stm32/Kconfig +++ b/drivers/crypto/stm32/Kconfig @@ -10,7 +10,7 @@ config CRYPTO_DEV_STM32_CRC config CRYPTO_DEV_STM32_HASH tristate "Support for STM32 hash accelerators" - depends on ARCH_STM32 + depends on ARCH_STM32 || ARCH_U8500 depends on HAS_DMA select CRYPTO_HASH select CRYPTO_MD5 @@ -23,7 +23,7 @@ config CRYPTO_DEV_STM32_HASH config CRYPTO_DEV_STM32_CRYP tristate "Support for STM32 cryp accelerators" - depends on ARCH_STM32 + depends on ARCH_STM32 || ARCH_U8500 select CRYPTO_HASH select CRYPTO_ENGINE select CRYPTO_LIB_DES -- GitLab From 0b496efbd2d00f658dbf906882d935e7fa3dfd03 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 3 Dec 2022 10:15:17 +0100 Subject: [PATCH 1280/1423] crypto: stm32/cryp - enable for use with Ux500 This adds a few small quirks to handle the differences between the STM32 and Ux500 cryp blocks. The following differences are handled with special bool switch bits in the capabilities: - The main difference is that some registers are removed, so we add register offsets for all registers in the per-variant data. Then we assign the right offsets for Ux500 vs the STM32 variants. - The Ux500 does not support the aeads algorithms; gcm(aes) and ccm(aes). Avoid registering them when running on Ux500. - The Ux500 has a special "linear" key format and does some elaborare bit swizzling of the key bits before writing them into the key registers. This is written as an "application note" inside the DB8500 design specification, and seems to be the result of some mishap when assigning the data lines to register bits. (STM32 has clearly fixed this.) - The Ux500 does not have the KP "key prepare" bit in the CR register. Instead, we need to set the KSE bit, "key schedule encryption" bit which does the same thing but is in bit 11 rather than being a special "algorithm type" as on STM32. The algorithm must however be specified as AES ECB while doing this. - The Ux500 cannot just read out IV registers, we need to set the KEYRDEN "key read enable" bit, as this protects not just the key but also the IV from being read out. Enable this bit before reading out the IV and disable it afterwards. Cc: Maxime Coquelin Cc: Alexandre Torgue Acked by: Lionel Debieve Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 413 +++++++++++++++++++++++------- 1 file changed, 322 insertions(+), 91 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 59638dfce573b..4208338e72b60 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -2,6 +2,7 @@ /* * Copyright (C) STMicroelectronics SA 2017 * Author: Fabien Dessenne + * Ux500 support taken from snippets in the old Ux500 cryp driver */ #include @@ -62,6 +63,29 @@ #define CRYP_CSGCMCCM0R 0x00000050 #define CRYP_CSGCM0R 0x00000070 +#define UX500_CRYP_CR 0x00000000 +#define UX500_CRYP_SR 0x00000004 +#define UX500_CRYP_DIN 0x00000008 +#define UX500_CRYP_DINSIZE 0x0000000C +#define UX500_CRYP_DOUT 0x00000010 +#define UX500_CRYP_DOUSIZE 0x00000014 +#define UX500_CRYP_DMACR 0x00000018 +#define UX500_CRYP_IMSC 0x0000001C +#define UX500_CRYP_RIS 0x00000020 +#define UX500_CRYP_MIS 0x00000024 +#define UX500_CRYP_K1L 0x00000028 +#define UX500_CRYP_K1R 0x0000002C +#define UX500_CRYP_K2L 0x00000030 +#define UX500_CRYP_K2R 0x00000034 +#define UX500_CRYP_K3L 0x00000038 +#define UX500_CRYP_K3R 0x0000003C +#define UX500_CRYP_K4L 0x00000040 +#define UX500_CRYP_K4R 0x00000044 +#define UX500_CRYP_IV0L 0x00000048 +#define UX500_CRYP_IV0R 0x0000004C +#define UX500_CRYP_IV1L 0x00000050 +#define UX500_CRYP_IV1R 0x00000054 + /* Registers values */ #define CR_DEC_NOT_ENC 0x00000004 #define CR_TDES_ECB 0x00000000 @@ -71,7 +95,8 @@ #define CR_AES_ECB 0x00000020 #define CR_AES_CBC 0x00000028 #define CR_AES_CTR 0x00000030 -#define CR_AES_KP 0x00000038 +#define CR_AES_KP 0x00000038 /* Not on Ux500 */ +#define CR_AES_XTS 0x00000038 /* Only on Ux500 */ #define CR_AES_GCM 0x00080000 #define CR_AES_CCM 0x00080008 #define CR_AES_UNKNOWN 0xFFFFFFFF @@ -83,6 +108,8 @@ #define CR_KEY128 0x00000000 #define CR_KEY192 0x00000100 #define CR_KEY256 0x00000200 +#define CR_KEYRDEN 0x00000400 /* Only on Ux500 */ +#define CR_KSE 0x00000800 /* Only on Ux500 */ #define CR_FFLUSH 0x00004000 #define CR_CRYPEN 0x00008000 #define CR_PH_INIT 0x00000000 @@ -107,8 +134,25 @@ #define CRYP_AUTOSUSPEND_DELAY 50 struct stm32_cryp_caps { - bool swap_final; - bool padding_wa; + bool aeads_support; + bool linear_aes_key; + bool kp_mode; + bool iv_protection; + bool swap_final; + bool padding_wa; + u32 cr; + u32 sr; + u32 din; + u32 dout; + u32 imsc; + u32 mis; + u32 k1l; + u32 k1r; + u32 k3r; + u32 iv0l; + u32 iv0r; + u32 iv1l; + u32 iv1r; }; struct stm32_cryp_ctx { @@ -228,20 +272,21 @@ static inline int stm32_cryp_wait_busy(struct stm32_cryp *cryp) { u32 status; - return readl_relaxed_poll_timeout(cryp->regs + CRYP_SR, status, + return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->sr, status, !(status & SR_BUSY), 10, 100000); } static inline void stm32_cryp_enable(struct stm32_cryp *cryp) { - writel_relaxed(readl_relaxed(cryp->regs + CRYP_CR) | CR_CRYPEN, cryp->regs + CRYP_CR); + writel_relaxed(readl_relaxed(cryp->regs + cryp->caps->cr) | CR_CRYPEN, + cryp->regs + cryp->caps->cr); } static inline int stm32_cryp_wait_enable(struct stm32_cryp *cryp) { u32 status; - return readl_relaxed_poll_timeout(cryp->regs + CRYP_CR, status, + return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->cr, status, !(status & CR_CRYPEN), 10, 100000); } @@ -249,10 +294,22 @@ static inline int stm32_cryp_wait_output(struct stm32_cryp *cryp) { u32 status; - return readl_relaxed_poll_timeout(cryp->regs + CRYP_SR, status, + return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->sr, status, status & SR_OFNE, 10, 100000); } +static inline void stm32_cryp_key_read_enable(struct stm32_cryp *cryp) +{ + writel_relaxed(readl_relaxed(cryp->regs + cryp->caps->cr) | CR_KEYRDEN, + cryp->regs + cryp->caps->cr); +} + +static inline void stm32_cryp_key_read_disable(struct stm32_cryp *cryp) +{ + writel_relaxed(readl_relaxed(cryp->regs + cryp->caps->cr) & ~CR_KEYRDEN, + cryp->regs + cryp->caps->cr); +} + static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp); static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err); @@ -281,12 +338,12 @@ static void stm32_cryp_hw_write_iv(struct stm32_cryp *cryp, __be32 *iv) if (!iv) return; - stm32_cryp_write(cryp, CRYP_IV0LR, be32_to_cpu(*iv++)); - stm32_cryp_write(cryp, CRYP_IV0RR, be32_to_cpu(*iv++)); + stm32_cryp_write(cryp, cryp->caps->iv0l, be32_to_cpu(*iv++)); + stm32_cryp_write(cryp, cryp->caps->iv0r, be32_to_cpu(*iv++)); if (is_aes(cryp)) { - stm32_cryp_write(cryp, CRYP_IV1LR, be32_to_cpu(*iv++)); - stm32_cryp_write(cryp, CRYP_IV1RR, be32_to_cpu(*iv++)); + stm32_cryp_write(cryp, cryp->caps->iv1l, be32_to_cpu(*iv++)); + stm32_cryp_write(cryp, cryp->caps->iv1r, be32_to_cpu(*iv++)); } } @@ -298,12 +355,102 @@ static void stm32_cryp_get_iv(struct stm32_cryp *cryp) if (!tmp) return; - *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV0LR)); - *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV0RR)); + if (cryp->caps->iv_protection) + stm32_cryp_key_read_enable(cryp); + + *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv0l)); + *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv0r)); if (is_aes(cryp)) { - *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV1LR)); - *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV1RR)); + *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv1l)); + *tmp++ = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv1r)); + } + + if (cryp->caps->iv_protection) + stm32_cryp_key_read_disable(cryp); +} + +/** + * ux500_swap_bits_in_byte() - mirror the bits in a byte + * @b: the byte to be mirrored + * + * The bits are swapped the following way: + * Byte b include bits 0-7, nibble 1 (n1) include bits 0-3 and + * nibble 2 (n2) bits 4-7. + * + * Nibble 1 (n1): + * (The "old" (moved) bit is replaced with a zero) + * 1. Move bit 6 and 7, 4 positions to the left. + * 2. Move bit 3 and 5, 2 positions to the left. + * 3. Move bit 1-4, 1 position to the left. + * + * Nibble 2 (n2): + * 1. Move bit 0 and 1, 4 positions to the right. + * 2. Move bit 2 and 4, 2 positions to the right. + * 3. Move bit 3-6, 1 position to the right. + * + * Combine the two nibbles to a complete and swapped byte. + */ +static inline u8 ux500_swap_bits_in_byte(u8 b) +{ +#define R_SHIFT_4_MASK 0xc0 /* Bits 6 and 7, right shift 4 */ +#define R_SHIFT_2_MASK 0x28 /* (After right shift 4) Bits 3 and 5, + right shift 2 */ +#define R_SHIFT_1_MASK 0x1e /* (After right shift 2) Bits 1-4, + right shift 1 */ +#define L_SHIFT_4_MASK 0x03 /* Bits 0 and 1, left shift 4 */ +#define L_SHIFT_2_MASK 0x14 /* (After left shift 4) Bits 2 and 4, + left shift 2 */ +#define L_SHIFT_1_MASK 0x78 /* (After left shift 1) Bits 3-6, + left shift 1 */ + + u8 n1; + u8 n2; + + /* Swap most significant nibble */ + /* Right shift 4, bits 6 and 7 */ + n1 = ((b & R_SHIFT_4_MASK) >> 4) | (b & ~(R_SHIFT_4_MASK >> 4)); + /* Right shift 2, bits 3 and 5 */ + n1 = ((n1 & R_SHIFT_2_MASK) >> 2) | (n1 & ~(R_SHIFT_2_MASK >> 2)); + /* Right shift 1, bits 1-4 */ + n1 = (n1 & R_SHIFT_1_MASK) >> 1; + + /* Swap least significant nibble */ + /* Left shift 4, bits 0 and 1 */ + n2 = ((b & L_SHIFT_4_MASK) << 4) | (b & ~(L_SHIFT_4_MASK << 4)); + /* Left shift 2, bits 2 and 4 */ + n2 = ((n2 & L_SHIFT_2_MASK) << 2) | (n2 & ~(L_SHIFT_2_MASK << 2)); + /* Left shift 1, bits 3-6 */ + n2 = (n2 & L_SHIFT_1_MASK) << 1; + + return n1 | n2; +} + +/** + * ux500_swizzle_key() - Shuffle around words and bits in the AES key + * @in: key to swizzle + * @out: swizzled key + * @len: length of key, in bytes + * + * This "key swizzling procedure" is described in the examples in the + * DB8500 design specification. There is no real description of why + * the bits have been arranged like this in the hardware. + */ +static inline void ux500_swizzle_key(const u8 *in, u8 *out, u32 len) +{ + int i = 0; + int bpw = sizeof(u32); + int j; + int index = 0; + + j = len - bpw; + while (j >= 0) { + for (i = 0; i < bpw; i++) { + index = len - j - bpw + i; + out[j + i] = + ux500_swap_bits_in_byte(in[index]); + } + j -= bpw; } } @@ -313,14 +460,33 @@ static void stm32_cryp_hw_write_key(struct stm32_cryp *c) int r_id; if (is_des(c)) { - stm32_cryp_write(c, CRYP_K1LR, be32_to_cpu(c->ctx->key[0])); - stm32_cryp_write(c, CRYP_K1RR, be32_to_cpu(c->ctx->key[1])); - } else { - r_id = CRYP_K3RR; - for (i = c->ctx->keylen / sizeof(u32); i > 0; i--, r_id -= 4) - stm32_cryp_write(c, r_id, - be32_to_cpu(c->ctx->key[i - 1])); + stm32_cryp_write(c, c->caps->k1l, be32_to_cpu(c->ctx->key[0])); + stm32_cryp_write(c, c->caps->k1r, be32_to_cpu(c->ctx->key[1])); + return; } + + /* + * On the Ux500 the AES key is considered as a single bit sequence + * of 128, 192 or 256 bits length. It is written linearly into the + * registers from K1L and down, and need to be processed to become + * a proper big-endian bit sequence. + */ + if (is_aes(c) && c->caps->linear_aes_key) { + u32 tmpkey[8]; + + ux500_swizzle_key((u8 *)c->ctx->key, + (u8 *)tmpkey, c->ctx->keylen); + + r_id = c->caps->k1l; + for (i = 0; i < c->ctx->keylen / sizeof(u32); i++, r_id += 4) + stm32_cryp_write(c, r_id, tmpkey[i]); + + return; + } + + r_id = c->caps->k3r; + for (i = c->ctx->keylen / sizeof(u32); i > 0; i--, r_id -= 4) + stm32_cryp_write(c, r_id, be32_to_cpu(c->ctx->key[i - 1])); } static u32 stm32_cryp_get_hw_mode(struct stm32_cryp *cryp) @@ -373,7 +539,7 @@ static int stm32_cryp_gcm_init(struct stm32_cryp *cryp, u32 cfg) cryp->gcm_ctr = GCM_CTR_INIT; stm32_cryp_hw_write_iv(cryp, iv); - stm32_cryp_write(cryp, CRYP_CR, cfg | CR_PH_INIT | CR_CRYPEN); + stm32_cryp_write(cryp, cryp->caps->cr, cfg | CR_PH_INIT | CR_CRYPEN); /* Wait for end of processing */ ret = stm32_cryp_wait_enable(cryp); @@ -385,10 +551,10 @@ static int stm32_cryp_gcm_init(struct stm32_cryp *cryp, u32 cfg) /* Prepare next phase */ if (cryp->areq->assoclen) { cfg |= CR_PH_HEADER; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } else if (stm32_cryp_get_input_text_len(cryp)) { cfg |= CR_PH_PAYLOAD; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } return 0; @@ -405,20 +571,20 @@ static void stm32_crypt_gcmccm_end_header(struct stm32_cryp *cryp) err = stm32_cryp_wait_busy(cryp); if (err) { dev_err(cryp->dev, "Timeout (gcm/ccm header)\n"); - stm32_cryp_write(cryp, CRYP_IMSCR, 0); + stm32_cryp_write(cryp, cryp->caps->imsc, 0); stm32_cryp_finish_req(cryp, err); return; } if (stm32_cryp_get_input_text_len(cryp)) { /* Phase 3 : payload */ - cfg = stm32_cryp_read(cryp, CRYP_CR); + cfg = stm32_cryp_read(cryp, cryp->caps->cr); cfg &= ~CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); cfg &= ~CR_PH_MASK; cfg |= CR_PH_PAYLOAD | CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } else { /* * Phase 4 : tag. @@ -458,7 +624,7 @@ static void stm32_cryp_write_ccm_first_header(struct stm32_cryp *cryp) scatterwalk_copychunks((char *)block + len, &cryp->in_walk, written, 0); for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, CRYP_DIN, block[i]); + stm32_cryp_write(cryp, cryp->caps->din, block[i]); cryp->header_in -= written; @@ -494,7 +660,7 @@ static int stm32_cryp_ccm_init(struct stm32_cryp *cryp, u32 cfg) b0[AES_BLOCK_SIZE - 1] = textlen & 0xFF; /* Enable HW */ - stm32_cryp_write(cryp, CRYP_CR, cfg | CR_PH_INIT | CR_CRYPEN); + stm32_cryp_write(cryp, cryp->caps->cr, cfg | CR_PH_INIT | CR_CRYPEN); /* Write B0 */ d = (u32 *)b0; @@ -505,7 +671,7 @@ static int stm32_cryp_ccm_init(struct stm32_cryp *cryp, u32 cfg) if (!cryp->caps->padding_wa) xd = be32_to_cpu(bd[i]); - stm32_cryp_write(cryp, CRYP_DIN, xd); + stm32_cryp_write(cryp, cryp->caps->din, xd); } /* Wait for end of processing */ @@ -518,13 +684,13 @@ static int stm32_cryp_ccm_init(struct stm32_cryp *cryp, u32 cfg) /* Prepare next phase */ if (cryp->areq->assoclen) { cfg |= CR_PH_HEADER | CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* Write first (special) block (may move to next phase [payload]) */ stm32_cryp_write_ccm_first_header(cryp); } else if (stm32_cryp_get_input_text_len(cryp)) { cfg |= CR_PH_PAYLOAD; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } return 0; @@ -538,7 +704,7 @@ static int stm32_cryp_hw_init(struct stm32_cryp *cryp) pm_runtime_get_sync(cryp->dev); /* Disable interrupt */ - stm32_cryp_write(cryp, CRYP_IMSCR, 0); + stm32_cryp_write(cryp, cryp->caps->imsc, 0); /* Set configuration */ cfg = CR_DATA8 | CR_FFLUSH; @@ -566,7 +732,12 @@ static int stm32_cryp_hw_init(struct stm32_cryp *cryp) if (is_decrypt(cryp) && ((hw_mode == CR_AES_ECB) || (hw_mode == CR_AES_CBC))) { /* Configure in key preparation mode */ - stm32_cryp_write(cryp, CRYP_CR, cfg | CR_AES_KP); + if (cryp->caps->kp_mode) + stm32_cryp_write(cryp, cryp->caps->cr, + cfg | CR_AES_KP); + else + stm32_cryp_write(cryp, + cryp->caps->cr, cfg | CR_AES_ECB | CR_KSE); /* Set key only after full configuration done */ stm32_cryp_hw_write_key(cryp); @@ -583,14 +754,14 @@ static int stm32_cryp_hw_init(struct stm32_cryp *cryp) cfg |= hw_mode | CR_DEC_NOT_ENC; /* Apply updated config (Decrypt + algo) and flush */ - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } else { cfg |= hw_mode; if (is_decrypt(cryp)) cfg |= CR_DEC_NOT_ENC; /* Apply config and flush */ - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* Set key only after configuration done */ stm32_cryp_hw_write_key(cryp); @@ -649,7 +820,7 @@ static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err) static int stm32_cryp_cpu_start(struct stm32_cryp *cryp) { /* Enable interrupt and let the IRQ handler do everything */ - stm32_cryp_write(cryp, CRYP_IMSCR, IMSCR_IN | IMSCR_OUT); + stm32_cryp_write(cryp, cryp->caps->imsc, IMSCR_IN | IMSCR_OUT); return 0; } @@ -1137,14 +1308,14 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) int ret = 0; /* Update Config */ - cfg = stm32_cryp_read(cryp, CRYP_CR); + cfg = stm32_cryp_read(cryp, cryp->caps->cr); cfg &= ~CR_PH_MASK; cfg |= CR_PH_FINAL; cfg &= ~CR_DEC_NOT_ENC; cfg |= CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); if (is_gcm(cryp)) { /* GCM: write aad and payload size (in bits) */ @@ -1152,8 +1323,8 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) if (cryp->caps->swap_final) size_bit = (__force u32)cpu_to_be32(size_bit); - stm32_cryp_write(cryp, CRYP_DIN, 0); - stm32_cryp_write(cryp, CRYP_DIN, size_bit); + stm32_cryp_write(cryp, cryp->caps->din, 0); + stm32_cryp_write(cryp, cryp->caps->din, size_bit); size_bit = is_encrypt(cryp) ? cryp->areq->cryptlen : cryp->areq->cryptlen - cryp->authsize; @@ -1161,8 +1332,8 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) if (cryp->caps->swap_final) size_bit = (__force u32)cpu_to_be32(size_bit); - stm32_cryp_write(cryp, CRYP_DIN, 0); - stm32_cryp_write(cryp, CRYP_DIN, size_bit); + stm32_cryp_write(cryp, cryp->caps->din, 0); + stm32_cryp_write(cryp, cryp->caps->din, size_bit); } else { /* CCM: write CTR0 */ u32 iv32[AES_BLOCK_32]; @@ -1177,7 +1348,7 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) if (!cryp->caps->padding_wa) xiv = be32_to_cpu(biv[i]); - stm32_cryp_write(cryp, CRYP_DIN, xiv); + stm32_cryp_write(cryp, cryp->caps->din, xiv); } } @@ -1193,7 +1364,7 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) /* Get and write tag */ for (i = 0; i < AES_BLOCK_32; i++) - out_tag[i] = stm32_cryp_read(cryp, CRYP_DOUT); + out_tag[i] = stm32_cryp_read(cryp, cryp->caps->dout); scatterwalk_copychunks(out_tag, &cryp->out_walk, cryp->authsize, 1); } else { @@ -1203,7 +1374,7 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) scatterwalk_copychunks(in_tag, &cryp->in_walk, cryp->authsize, 0); for (i = 0; i < AES_BLOCK_32; i++) - out_tag[i] = stm32_cryp_read(cryp, CRYP_DOUT); + out_tag[i] = stm32_cryp_read(cryp, cryp->caps->dout); if (crypto_memneq(in_tag, out_tag, cryp->authsize)) ret = -EBADMSG; @@ -1211,7 +1382,7 @@ static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) /* Disable cryp */ cfg &= ~CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); return ret; } @@ -1227,19 +1398,19 @@ static void stm32_cryp_check_ctr_counter(struct stm32_cryp *cryp) */ crypto_inc((u8 *)cryp->last_ctr, sizeof(cryp->last_ctr)); - cr = stm32_cryp_read(cryp, CRYP_CR); - stm32_cryp_write(cryp, CRYP_CR, cr & ~CR_CRYPEN); + cr = stm32_cryp_read(cryp, cryp->caps->cr); + stm32_cryp_write(cryp, cryp->caps->cr, cr & ~CR_CRYPEN); stm32_cryp_hw_write_iv(cryp, cryp->last_ctr); - stm32_cryp_write(cryp, CRYP_CR, cr); + stm32_cryp_write(cryp, cryp->caps->cr, cr); } /* The IV registers are BE */ - cryp->last_ctr[0] = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV0LR)); - cryp->last_ctr[1] = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV0RR)); - cryp->last_ctr[2] = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV1LR)); - cryp->last_ctr[3] = cpu_to_be32(stm32_cryp_read(cryp, CRYP_IV1RR)); + cryp->last_ctr[0] = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv0l)); + cryp->last_ctr[1] = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv0r)); + cryp->last_ctr[2] = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv1l)); + cryp->last_ctr[3] = cpu_to_be32(stm32_cryp_read(cryp, cryp->caps->iv1r)); } static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp) @@ -1248,7 +1419,7 @@ static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp) u32 block[AES_BLOCK_32]; for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, CRYP_DOUT); + block[i] = stm32_cryp_read(cryp, cryp->caps->dout); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); @@ -1264,7 +1435,7 @@ static void stm32_cryp_irq_write_block(struct stm32_cryp *cryp) scatterwalk_copychunks(block, &cryp->in_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_in), 0); for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - stm32_cryp_write(cryp, CRYP_DIN, block[i]); + stm32_cryp_write(cryp, cryp->caps->din, block[i]); cryp->payload_in -= min_t(size_t, cryp->hw_blocksize, cryp->payload_in); } @@ -1278,22 +1449,22 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) /* 'Special workaround' procedure described in the datasheet */ /* a) disable ip */ - stm32_cryp_write(cryp, CRYP_IMSCR, 0); - cfg = stm32_cryp_read(cryp, CRYP_CR); + stm32_cryp_write(cryp, cryp->caps->imsc, 0); + cfg = stm32_cryp_read(cryp, cryp->caps->cr); cfg &= ~CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* b) Update IV1R */ - stm32_cryp_write(cryp, CRYP_IV1RR, cryp->gcm_ctr - 2); + stm32_cryp_write(cryp, cryp->caps->iv1r, cryp->gcm_ctr - 2); /* c) change mode to CTR */ cfg &= ~CR_ALGO_MASK; cfg |= CR_AES_CTR; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* a) enable IP */ cfg |= CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* b) pad and write the last block */ stm32_cryp_irq_write_block(cryp); @@ -1310,7 +1481,7 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) * block value */ for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, CRYP_DOUT); + block[i] = stm32_cryp_read(cryp, cryp->caps->dout); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); @@ -1320,16 +1491,16 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) /* d) change mode back to AES GCM */ cfg &= ~CR_ALGO_MASK; cfg |= CR_AES_GCM; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* e) change phase to Final */ cfg &= ~CR_PH_MASK; cfg |= CR_PH_FINAL; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* f) write padded data */ for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, CRYP_DIN, block[i]); + stm32_cryp_write(cryp, cryp->caps->din, block[i]); /* g) Empty fifo out */ err = stm32_cryp_wait_output(cryp); @@ -1339,7 +1510,7 @@ static void stm32_cryp_irq_write_gcm_padded_data(struct stm32_cryp *cryp) } for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_read(cryp, CRYP_DOUT); + stm32_cryp_read(cryp, cryp->caps->dout); /* h) run the he normal Final phase */ stm32_cryp_finish_req(cryp, 0); @@ -1350,13 +1521,13 @@ static void stm32_cryp_irq_set_npblb(struct stm32_cryp *cryp) u32 cfg; /* disable ip, set NPBLB and reneable ip */ - cfg = stm32_cryp_read(cryp, CRYP_CR); + cfg = stm32_cryp_read(cryp, cryp->caps->cr); cfg &= ~CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); cfg |= (cryp->hw_blocksize - cryp->payload_in) << CR_NBPBL_SHIFT; cfg |= CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); } static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) @@ -1370,11 +1541,11 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) /* 'Special workaround' procedure described in the datasheet */ /* a) disable ip */ - stm32_cryp_write(cryp, CRYP_IMSCR, 0); + stm32_cryp_write(cryp, cryp->caps->imsc, 0); - cfg = stm32_cryp_read(cryp, CRYP_CR); + cfg = stm32_cryp_read(cryp, cryp->caps->cr); cfg &= ~CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* b) get IV1 from CRYP_CSGCMCCM7 */ iv1tmp = stm32_cryp_read(cryp, CRYP_CSGCMCCM0R + 7 * 4); @@ -1384,16 +1555,16 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) cstmp1[i] = stm32_cryp_read(cryp, CRYP_CSGCMCCM0R + i * 4); /* d) Write IV1R */ - stm32_cryp_write(cryp, CRYP_IV1RR, iv1tmp); + stm32_cryp_write(cryp, cryp->caps->iv1r, iv1tmp); /* e) change mode to CTR */ cfg &= ~CR_ALGO_MASK; cfg |= CR_AES_CTR; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* a) enable IP */ cfg |= CR_CRYPEN; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* b) pad and write the last block */ stm32_cryp_irq_write_block(cryp); @@ -1410,7 +1581,7 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) * block value */ for (i = 0; i < cryp->hw_blocksize / sizeof(u32); i++) - block[i] = stm32_cryp_read(cryp, CRYP_DOUT); + block[i] = stm32_cryp_read(cryp, cryp->caps->dout); scatterwalk_copychunks(block, &cryp->out_walk, min_t(size_t, cryp->hw_blocksize, cryp->payload_out), 1); @@ -1423,18 +1594,18 @@ static void stm32_cryp_irq_write_ccm_padded_data(struct stm32_cryp *cryp) /* e) change mode back to AES CCM */ cfg &= ~CR_ALGO_MASK; cfg |= CR_AES_CCM; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* f) change phase to header */ cfg &= ~CR_PH_MASK; cfg |= CR_PH_HEADER; - stm32_cryp_write(cryp, CRYP_CR, cfg); + stm32_cryp_write(cryp, cryp->caps->cr, cfg); /* g) XOR and write padded data */ for (i = 0; i < ARRAY_SIZE(block); i++) { block[i] ^= cstmp1[i]; block[i] ^= cstmp2[i]; - stm32_cryp_write(cryp, CRYP_DIN, block[i]); + stm32_cryp_write(cryp, cryp->caps->din, block[i]); } /* h) wait for completion */ @@ -1497,7 +1668,7 @@ static void stm32_cryp_irq_write_gcmccm_header(struct stm32_cryp *cryp) scatterwalk_copychunks(block, &cryp->in_walk, written, 0); for (i = 0; i < AES_BLOCK_32; i++) - stm32_cryp_write(cryp, CRYP_DIN, block[i]); + stm32_cryp_write(cryp, cryp->caps->din, block[i]); cryp->header_in -= written; @@ -1508,7 +1679,7 @@ static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg) { struct stm32_cryp *cryp = arg; u32 ph; - u32 it_mask = stm32_cryp_read(cryp, CRYP_IMSCR); + u32 it_mask = stm32_cryp_read(cryp, cryp->caps->imsc); if (cryp->irq_status & MISR_OUT) /* Output FIFO IRQ: read data */ @@ -1516,7 +1687,7 @@ static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg) if (cryp->irq_status & MISR_IN) { if (is_gcm(cryp) || is_ccm(cryp)) { - ph = stm32_cryp_read(cryp, CRYP_CR) & CR_PH_MASK; + ph = stm32_cryp_read(cryp, cryp->caps->cr) & CR_PH_MASK; if (unlikely(ph == CR_PH_HEADER)) /* Write Header */ stm32_cryp_irq_write_gcmccm_header(cryp); @@ -1536,7 +1707,7 @@ static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg) it_mask &= ~IMSCR_IN; if (!cryp->payload_out) it_mask &= ~IMSCR_OUT; - stm32_cryp_write(cryp, CRYP_IMSCR, it_mask); + stm32_cryp_write(cryp, cryp->caps->imsc, it_mask); if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) stm32_cryp_finish_req(cryp, 0); @@ -1548,7 +1719,7 @@ static irqreturn_t stm32_cryp_irq(int irq, void *arg) { struct stm32_cryp *cryp = arg; - cryp->irq_status = stm32_cryp_read(cryp, CRYP_MISR); + cryp->irq_status = stm32_cryp_read(cryp, cryp->caps->mis); return IRQ_WAKE_THREAD; } @@ -1722,17 +1893,74 @@ static struct aead_alg aead_algs[] = { }, }; +static const struct stm32_cryp_caps ux500_data = { + .aeads_support = false, + .linear_aes_key = true, + .kp_mode = false, + .iv_protection = true, + .swap_final = true, + .padding_wa = true, + .cr = UX500_CRYP_CR, + .sr = UX500_CRYP_SR, + .din = UX500_CRYP_DIN, + .dout = UX500_CRYP_DOUT, + .imsc = UX500_CRYP_IMSC, + .mis = UX500_CRYP_MIS, + .k1l = UX500_CRYP_K1L, + .k1r = UX500_CRYP_K1R, + .k3r = UX500_CRYP_K3R, + .iv0l = UX500_CRYP_IV0L, + .iv0r = UX500_CRYP_IV0R, + .iv1l = UX500_CRYP_IV1L, + .iv1r = UX500_CRYP_IV1R, +}; + static const struct stm32_cryp_caps f7_data = { + .aeads_support = true, + .linear_aes_key = false, + .kp_mode = true, + .iv_protection = false, .swap_final = true, .padding_wa = true, + .cr = CRYP_CR, + .sr = CRYP_SR, + .din = CRYP_DIN, + .dout = CRYP_DOUT, + .imsc = CRYP_IMSCR, + .mis = CRYP_MISR, + .k1l = CRYP_K1LR, + .k1r = CRYP_K1RR, + .k3r = CRYP_K3RR, + .iv0l = CRYP_IV0LR, + .iv0r = CRYP_IV0RR, + .iv1l = CRYP_IV1LR, + .iv1r = CRYP_IV1RR, }; static const struct stm32_cryp_caps mp1_data = { + .aeads_support = true, + .linear_aes_key = false, + .kp_mode = true, + .iv_protection = false, .swap_final = false, .padding_wa = false, + .cr = CRYP_CR, + .sr = CRYP_SR, + .din = CRYP_DIN, + .dout = CRYP_DOUT, + .imsc = CRYP_IMSCR, + .mis = CRYP_MISR, + .k1l = CRYP_K1LR, + .k1r = CRYP_K1RR, + .k3r = CRYP_K3RR, + .iv0l = CRYP_IV0LR, + .iv0r = CRYP_IV0RR, + .iv1l = CRYP_IV1LR, + .iv1r = CRYP_IV1RR, }; static const struct of_device_id stm32_dt_ids[] = { + { .compatible = "stericsson,ux500-cryp", .data = &ux500_data}, { .compatible = "st,stm32f756-cryp", .data = &f7_data}, { .compatible = "st,stm32mp1-cryp", .data = &mp1_data}, {}, @@ -1829,9 +2057,11 @@ static int stm32_cryp_probe(struct platform_device *pdev) goto err_algs; } - ret = crypto_register_aeads(aead_algs, ARRAY_SIZE(aead_algs)); - if (ret) - goto err_aead_algs; + if (cryp->caps->aeads_support) { + ret = crypto_register_aeads(aead_algs, ARRAY_SIZE(aead_algs)); + if (ret) + goto err_aead_algs; + } dev_info(dev, "Initialized\n"); @@ -1869,7 +2099,8 @@ static int stm32_cryp_remove(struct platform_device *pdev) if (ret < 0) return ret; - crypto_unregister_aeads(aead_algs, ARRAY_SIZE(aead_algs)); + if (cryp->caps->aeads_support) + crypto_unregister_aeads(aead_algs, ARRAY_SIZE(aead_algs)); crypto_unregister_skciphers(crypto_algs, ARRAY_SIZE(crypto_algs)); crypto_engine_exit(cryp->engine); -- GitLab From 453de3eb08c4b7e31b3019a4b0cc3ebce51a6219 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 3 Dec 2022 10:15:18 +0100 Subject: [PATCH 1281/1423] crypto: ux500/cryp - delete driver It turns out we can just modify the newer STM32 CRYP driver to be used with Ux500 and now that we have done that, delete the old and sparsely maintained Ux500 CRYP driver. Cc: Lionel Debieve Cc: Maxime Coquelin Cc: Alexandre Torgue Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/ux500/Kconfig | 10 - drivers/crypto/ux500/Makefile | 1 - drivers/crypto/ux500/cryp/Makefile | 10 - drivers/crypto/ux500/cryp/cryp.c | 394 ------ drivers/crypto/ux500/cryp/cryp.h | 315 ----- drivers/crypto/ux500/cryp/cryp_core.c | 1600 ------------------------- drivers/crypto/ux500/cryp/cryp_irq.c | 45 - drivers/crypto/ux500/cryp/cryp_irq.h | 31 - drivers/crypto/ux500/cryp/cryp_irqp.h | 125 -- drivers/crypto/ux500/cryp/cryp_p.h | 122 -- 10 files changed, 2653 deletions(-) delete mode 100644 drivers/crypto/ux500/cryp/Makefile delete mode 100644 drivers/crypto/ux500/cryp/cryp.c delete mode 100644 drivers/crypto/ux500/cryp/cryp.h delete mode 100644 drivers/crypto/ux500/cryp/cryp_core.c delete mode 100644 drivers/crypto/ux500/cryp/cryp_irq.c delete mode 100644 drivers/crypto/ux500/cryp/cryp_irq.h delete mode 100644 drivers/crypto/ux500/cryp/cryp_irqp.h delete mode 100644 drivers/crypto/ux500/cryp/cryp_p.h diff --git a/drivers/crypto/ux500/Kconfig b/drivers/crypto/ux500/Kconfig index f56d65c56ccf7..dcbd7404768f1 100644 --- a/drivers/crypto/ux500/Kconfig +++ b/drivers/crypto/ux500/Kconfig @@ -4,16 +4,6 @@ # Author: Shujuan Chen (shujuan.chen@stericsson.com) # -config CRYPTO_DEV_UX500_CRYP - tristate "UX500 crypto driver for CRYP block" - depends on CRYPTO_DEV_UX500 - select CRYPTO_ALGAPI - select CRYPTO_SKCIPHER - select CRYPTO_LIB_DES - help - This selects the crypto driver for the UX500_CRYP hardware. It supports - AES-ECB, CBC and CTR with keys sizes of 128, 192 and 256 bit sizes. - config CRYPTO_DEV_UX500_HASH tristate "UX500 crypto driver for HASH block" depends on CRYPTO_DEV_UX500 diff --git a/drivers/crypto/ux500/Makefile b/drivers/crypto/ux500/Makefile index f014eb01710a6..f1aa4edf66f4f 100644 --- a/drivers/crypto/ux500/Makefile +++ b/drivers/crypto/ux500/Makefile @@ -5,4 +5,3 @@ # obj-$(CONFIG_CRYPTO_DEV_UX500_HASH) += hash/ -obj-$(CONFIG_CRYPTO_DEV_UX500_CRYP) += cryp/ diff --git a/drivers/crypto/ux500/cryp/Makefile b/drivers/crypto/ux500/cryp/Makefile deleted file mode 100644 index 3e67531f484cd..0000000000000 --- a/drivers/crypto/ux500/cryp/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -#/* -# * Copyright (C) ST-Ericsson SA 2010 -# * Author: shujuan.chen@stericsson.com for ST-Ericsson. -# */ - -ccflags-$(CONFIG_CRYPTO_DEV_UX500_DEBUG) += -DDEBUG - -obj-$(CONFIG_CRYPTO_DEV_UX500_CRYP) += ux500_cryp.o -ux500_cryp-objs := cryp.o cryp_irq.o cryp_core.o diff --git a/drivers/crypto/ux500/cryp/cryp.c b/drivers/crypto/ux500/cryp/cryp.c deleted file mode 100644 index 759d0d9786fd1..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp.c +++ /dev/null @@ -1,394 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - */ - -#include -#include -#include - -#include "cryp_p.h" -#include "cryp.h" - -/* - * cryp_wait_until_done - wait until the device logic is not busy - */ -void cryp_wait_until_done(struct cryp_device_data *device_data) -{ - while (cryp_is_logic_busy(device_data)) - cpu_relax(); -} - -/** - * cryp_check - This routine checks Peripheral and PCell Id - * @device_data: Pointer to the device data struct for base address. - */ -int cryp_check(struct cryp_device_data *device_data) -{ - int peripheralid2 = 0; - - if (NULL == device_data) - return -EINVAL; - - peripheralid2 = readl_relaxed(&device_data->base->periphId2); - - if (peripheralid2 != CRYP_PERIPHERAL_ID2_DB8500) - return -EPERM; - - /* Check Peripheral and Pcell Id Register for CRYP */ - if ((CRYP_PERIPHERAL_ID0 == - readl_relaxed(&device_data->base->periphId0)) - && (CRYP_PERIPHERAL_ID1 == - readl_relaxed(&device_data->base->periphId1)) - && (CRYP_PERIPHERAL_ID3 == - readl_relaxed(&device_data->base->periphId3)) - && (CRYP_PCELL_ID0 == - readl_relaxed(&device_data->base->pcellId0)) - && (CRYP_PCELL_ID1 == - readl_relaxed(&device_data->base->pcellId1)) - && (CRYP_PCELL_ID2 == - readl_relaxed(&device_data->base->pcellId2)) - && (CRYP_PCELL_ID3 == - readl_relaxed(&device_data->base->pcellId3))) { - return 0; - } - - return -EPERM; -} - -/** - * cryp_activity - This routine enables/disable the cryptography function. - * @device_data: Pointer to the device data struct for base address. - * @cryp_crypen: Enable/Disable functionality - */ -void cryp_activity(struct cryp_device_data *device_data, - enum cryp_crypen cryp_crypen) -{ - CRYP_PUT_BITS(&device_data->base->cr, - cryp_crypen, - CRYP_CR_CRYPEN_POS, - CRYP_CR_CRYPEN_MASK); -} - -/** - * cryp_flush_inoutfifo - Resets both the input and the output FIFOs - * @device_data: Pointer to the device data struct for base address. - */ -void cryp_flush_inoutfifo(struct cryp_device_data *device_data) -{ - /* - * We always need to disable the hardware before trying to flush the - * FIFO. This is something that isn't written in the design - * specification, but we have been informed by the hardware designers - * that this must be done. - */ - cryp_activity(device_data, CRYP_CRYPEN_DISABLE); - cryp_wait_until_done(device_data); - - CRYP_SET_BITS(&device_data->base->cr, CRYP_CR_FFLUSH_MASK); - /* - * CRYP_SR_INFIFO_READY_MASK is the expected value on the status - * register when starting a new calculation, which means Input FIFO is - * not full and input FIFO is empty. - */ - while (readl_relaxed(&device_data->base->sr) != - CRYP_SR_INFIFO_READY_MASK) - cpu_relax(); -} - -/** - * cryp_set_configuration - This routine set the cr CRYP IP - * @device_data: Pointer to the device data struct for base address. - * @cryp_config: Pointer to the configuration parameter - * @control_register: The control register to be written later on. - */ -int cryp_set_configuration(struct cryp_device_data *device_data, - struct cryp_config *cryp_config, - u32 *control_register) -{ - u32 cr_for_kse; - - if (NULL == device_data || NULL == cryp_config) - return -EINVAL; - - *control_register |= (cryp_config->keysize << CRYP_CR_KEYSIZE_POS); - - /* Prepare key for decryption in AES_ECB and AES_CBC mode. */ - if ((CRYP_ALGORITHM_DECRYPT == cryp_config->algodir) && - ((CRYP_ALGO_AES_ECB == cryp_config->algomode) || - (CRYP_ALGO_AES_CBC == cryp_config->algomode))) { - cr_for_kse = *control_register; - /* - * This seems a bit odd, but it is indeed needed to set this to - * encrypt even though it is a decryption that we are doing. It - * also mentioned in the design spec that you need to do this. - * After the keyprepartion for decrypting is done you should set - * algodir back to decryption, which is done outside this if - * statement. - * - * According to design specification we should set mode ECB - * during key preparation even though we might be running CBC - * when enter this function. - * - * Writing to KSE_ENABLED will drop CRYPEN when key preparation - * is done. Therefore we need to set CRYPEN again outside this - * if statement when running decryption. - */ - cr_for_kse |= ((CRYP_ALGORITHM_ENCRYPT << CRYP_CR_ALGODIR_POS) | - (CRYP_ALGO_AES_ECB << CRYP_CR_ALGOMODE_POS) | - (CRYP_CRYPEN_ENABLE << CRYP_CR_CRYPEN_POS) | - (KSE_ENABLED << CRYP_CR_KSE_POS)); - - writel_relaxed(cr_for_kse, &device_data->base->cr); - cryp_wait_until_done(device_data); - } - - *control_register |= - ((cryp_config->algomode << CRYP_CR_ALGOMODE_POS) | - (cryp_config->algodir << CRYP_CR_ALGODIR_POS)); - - return 0; -} - -/** - * cryp_configure_protection - set the protection bits in the CRYP logic. - * @device_data: Pointer to the device data struct for base address. - * @p_protect_config: Pointer to the protection mode and - * secure mode configuration - */ -int cryp_configure_protection(struct cryp_device_data *device_data, - struct cryp_protection_config *p_protect_config) -{ - if (NULL == p_protect_config) - return -EINVAL; - - CRYP_WRITE_BIT(&device_data->base->cr, - (u32) p_protect_config->secure_access, - CRYP_CR_SECURE_MASK); - CRYP_PUT_BITS(&device_data->base->cr, - p_protect_config->privilege_access, - CRYP_CR_PRLG_POS, - CRYP_CR_PRLG_MASK); - - return 0; -} - -/** - * cryp_is_logic_busy - returns the busy status of the CRYP logic - * @device_data: Pointer to the device data struct for base address. - */ -int cryp_is_logic_busy(struct cryp_device_data *device_data) -{ - return CRYP_TEST_BITS(&device_data->base->sr, - CRYP_SR_BUSY_MASK); -} - -/** - * cryp_configure_for_dma - configures the CRYP IP for DMA operation - * @device_data: Pointer to the device data struct for base address. - * @dma_req: Specifies the DMA request type value. - */ -void cryp_configure_for_dma(struct cryp_device_data *device_data, - enum cryp_dma_req_type dma_req) -{ - CRYP_SET_BITS(&device_data->base->dmacr, - (u32) dma_req); -} - -/** - * cryp_configure_key_values - configures the key values for CRYP operations - * @device_data: Pointer to the device data struct for base address. - * @key_reg_index: Key value index register - * @key_value: The key value struct - */ -int cryp_configure_key_values(struct cryp_device_data *device_data, - enum cryp_key_reg_index key_reg_index, - struct cryp_key_value key_value) -{ - while (cryp_is_logic_busy(device_data)) - cpu_relax(); - - switch (key_reg_index) { - case CRYP_KEY_REG_1: - writel_relaxed(key_value.key_value_left, - &device_data->base->key_1_l); - writel_relaxed(key_value.key_value_right, - &device_data->base->key_1_r); - break; - case CRYP_KEY_REG_2: - writel_relaxed(key_value.key_value_left, - &device_data->base->key_2_l); - writel_relaxed(key_value.key_value_right, - &device_data->base->key_2_r); - break; - case CRYP_KEY_REG_3: - writel_relaxed(key_value.key_value_left, - &device_data->base->key_3_l); - writel_relaxed(key_value.key_value_right, - &device_data->base->key_3_r); - break; - case CRYP_KEY_REG_4: - writel_relaxed(key_value.key_value_left, - &device_data->base->key_4_l); - writel_relaxed(key_value.key_value_right, - &device_data->base->key_4_r); - break; - default: - return -EINVAL; - } - - return 0; -} - -/** - * cryp_configure_init_vector - configures the initialization vector register - * @device_data: Pointer to the device data struct for base address. - * @init_vector_index: Specifies the index of the init vector. - * @init_vector_value: Specifies the value for the init vector. - */ -int cryp_configure_init_vector(struct cryp_device_data *device_data, - enum cryp_init_vector_index - init_vector_index, - struct cryp_init_vector_value - init_vector_value) -{ - while (cryp_is_logic_busy(device_data)) - cpu_relax(); - - switch (init_vector_index) { - case CRYP_INIT_VECTOR_INDEX_0: - writel_relaxed(init_vector_value.init_value_left, - &device_data->base->init_vect_0_l); - writel_relaxed(init_vector_value.init_value_right, - &device_data->base->init_vect_0_r); - break; - case CRYP_INIT_VECTOR_INDEX_1: - writel_relaxed(init_vector_value.init_value_left, - &device_data->base->init_vect_1_l); - writel_relaxed(init_vector_value.init_value_right, - &device_data->base->init_vect_1_r); - break; - default: - return -EINVAL; - } - - return 0; -} - -/** - * cryp_save_device_context - Store hardware registers and - * other device context parameter - * @device_data: Pointer to the device data struct for base address. - * @ctx: Crypto device context - * @cryp_mode: Mode: Polling, Interrupt or DMA - */ -void cryp_save_device_context(struct cryp_device_data *device_data, - struct cryp_device_context *ctx, - int cryp_mode) -{ - enum cryp_algo_mode algomode; - struct cryp_register __iomem *src_reg = device_data->base; - struct cryp_config *config = - (struct cryp_config *)device_data->current_ctx; - - /* - * Always start by disable the hardware and wait for it to finish the - * ongoing calculations before trying to reprogram it. - */ - cryp_activity(device_data, CRYP_CRYPEN_DISABLE); - cryp_wait_until_done(device_data); - - if (cryp_mode == CRYP_MODE_DMA) - cryp_configure_for_dma(device_data, CRYP_DMA_DISABLE_BOTH); - - if (CRYP_TEST_BITS(&src_reg->sr, CRYP_SR_IFEM_MASK) == 0) - ctx->din = readl_relaxed(&src_reg->din); - - ctx->cr = readl_relaxed(&src_reg->cr) & CRYP_CR_CONTEXT_SAVE_MASK; - - switch (config->keysize) { - case CRYP_KEY_SIZE_256: - ctx->key_4_l = readl_relaxed(&src_reg->key_4_l); - ctx->key_4_r = readl_relaxed(&src_reg->key_4_r); - fallthrough; - - case CRYP_KEY_SIZE_192: - ctx->key_3_l = readl_relaxed(&src_reg->key_3_l); - ctx->key_3_r = readl_relaxed(&src_reg->key_3_r); - fallthrough; - - case CRYP_KEY_SIZE_128: - ctx->key_2_l = readl_relaxed(&src_reg->key_2_l); - ctx->key_2_r = readl_relaxed(&src_reg->key_2_r); - fallthrough; - - default: - ctx->key_1_l = readl_relaxed(&src_reg->key_1_l); - ctx->key_1_r = readl_relaxed(&src_reg->key_1_r); - } - - /* Save IV for CBC mode for both AES and DES. */ - algomode = ((ctx->cr & CRYP_CR_ALGOMODE_MASK) >> CRYP_CR_ALGOMODE_POS); - if (algomode == CRYP_ALGO_TDES_CBC || - algomode == CRYP_ALGO_DES_CBC || - algomode == CRYP_ALGO_AES_CBC) { - ctx->init_vect_0_l = readl_relaxed(&src_reg->init_vect_0_l); - ctx->init_vect_0_r = readl_relaxed(&src_reg->init_vect_0_r); - ctx->init_vect_1_l = readl_relaxed(&src_reg->init_vect_1_l); - ctx->init_vect_1_r = readl_relaxed(&src_reg->init_vect_1_r); - } -} - -/** - * cryp_restore_device_context - Restore hardware registers and - * other device context parameter - * @device_data: Pointer to the device data struct for base address. - * @ctx: Crypto device context - */ -void cryp_restore_device_context(struct cryp_device_data *device_data, - struct cryp_device_context *ctx) -{ - struct cryp_register __iomem *reg = device_data->base; - struct cryp_config *config = - (struct cryp_config *)device_data->current_ctx; - - /* - * Fall through for all items in switch statement. DES is captured in - * the default. - */ - switch (config->keysize) { - case CRYP_KEY_SIZE_256: - writel_relaxed(ctx->key_4_l, ®->key_4_l); - writel_relaxed(ctx->key_4_r, ®->key_4_r); - fallthrough; - - case CRYP_KEY_SIZE_192: - writel_relaxed(ctx->key_3_l, ®->key_3_l); - writel_relaxed(ctx->key_3_r, ®->key_3_r); - fallthrough; - - case CRYP_KEY_SIZE_128: - writel_relaxed(ctx->key_2_l, ®->key_2_l); - writel_relaxed(ctx->key_2_r, ®->key_2_r); - fallthrough; - - default: - writel_relaxed(ctx->key_1_l, ®->key_1_l); - writel_relaxed(ctx->key_1_r, ®->key_1_r); - } - - /* Restore IV for CBC mode for AES and DES. */ - if (config->algomode == CRYP_ALGO_TDES_CBC || - config->algomode == CRYP_ALGO_DES_CBC || - config->algomode == CRYP_ALGO_AES_CBC) { - writel_relaxed(ctx->init_vect_0_l, ®->init_vect_0_l); - writel_relaxed(ctx->init_vect_0_r, ®->init_vect_0_r); - writel_relaxed(ctx->init_vect_1_l, ®->init_vect_1_l); - writel_relaxed(ctx->init_vect_1_r, ®->init_vect_1_r); - } -} diff --git a/drivers/crypto/ux500/cryp/cryp.h b/drivers/crypto/ux500/cryp/cryp.h deleted file mode 100644 index 59e1557a620a9..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp.h +++ /dev/null @@ -1,315 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - */ - -#ifndef _CRYP_H_ -#define _CRYP_H_ - -#include -#include -#include -#include - -#define DEV_DBG_NAME "crypX crypX:" - -/* CRYP enable/disable */ -enum cryp_crypen { - CRYP_CRYPEN_DISABLE = 0, - CRYP_CRYPEN_ENABLE = 1 -}; - -/* CRYP Start Computation enable/disable */ -enum cryp_start { - CRYP_START_DISABLE = 0, - CRYP_START_ENABLE = 1 -}; - -/* CRYP Init Signal enable/disable */ -enum cryp_init { - CRYP_INIT_DISABLE = 0, - CRYP_INIT_ENABLE = 1 -}; - -/* Cryp State enable/disable */ -enum cryp_state { - CRYP_STATE_DISABLE = 0, - CRYP_STATE_ENABLE = 1 -}; - -/* Key preparation bit enable */ -enum cryp_key_prep { - KSE_DISABLED = 0, - KSE_ENABLED = 1 -}; - -/* Key size for AES */ -#define CRYP_KEY_SIZE_128 (0) -#define CRYP_KEY_SIZE_192 (1) -#define CRYP_KEY_SIZE_256 (2) - -/* AES modes */ -enum cryp_algo_mode { - CRYP_ALGO_TDES_ECB, - CRYP_ALGO_TDES_CBC, - CRYP_ALGO_DES_ECB, - CRYP_ALGO_DES_CBC, - CRYP_ALGO_AES_ECB, - CRYP_ALGO_AES_CBC, - CRYP_ALGO_AES_CTR, - CRYP_ALGO_AES_XTS -}; - -/* Cryp Encryption or Decryption */ -enum cryp_algorithm_dir { - CRYP_ALGORITHM_ENCRYPT, - CRYP_ALGORITHM_DECRYPT -}; - -/* Hardware access method */ -enum cryp_mode { - CRYP_MODE_POLLING, - CRYP_MODE_INTERRUPT, - CRYP_MODE_DMA -}; - -/** - * struct cryp_config - - * @keysize: Key size for AES - * @algomode: AES modes - * @algodir: Cryp Encryption or Decryption - * - * CRYP configuration structure to be passed to set configuration - */ -struct cryp_config { - int keysize; - enum cryp_algo_mode algomode; - enum cryp_algorithm_dir algodir; -}; - -/** - * struct cryp_protection_config - - * @privilege_access: Privileged cryp state enable/disable - * @secure_access: Secure cryp state enable/disable - * - * Protection configuration structure for setting privilage access - */ -struct cryp_protection_config { - enum cryp_state privilege_access; - enum cryp_state secure_access; -}; - -/* Cryp status */ -enum cryp_status_id { - CRYP_STATUS_BUSY = 0x10, - CRYP_STATUS_OUTPUT_FIFO_FULL = 0x08, - CRYP_STATUS_OUTPUT_FIFO_NOT_EMPTY = 0x04, - CRYP_STATUS_INPUT_FIFO_NOT_FULL = 0x02, - CRYP_STATUS_INPUT_FIFO_EMPTY = 0x01 -}; - -/* Cryp DMA interface */ -#define CRYP_DMA_TX_FIFO 0x08 -#define CRYP_DMA_RX_FIFO 0x10 - -enum cryp_dma_req_type { - CRYP_DMA_DISABLE_BOTH, - CRYP_DMA_ENABLE_IN_DATA, - CRYP_DMA_ENABLE_OUT_DATA, - CRYP_DMA_ENABLE_BOTH_DIRECTIONS -}; - -enum cryp_dma_channel { - CRYP_DMA_RX = 0, - CRYP_DMA_TX -}; - -/* Key registers */ -enum cryp_key_reg_index { - CRYP_KEY_REG_1, - CRYP_KEY_REG_2, - CRYP_KEY_REG_3, - CRYP_KEY_REG_4 -}; - -/* Key register left and right */ -struct cryp_key_value { - u32 key_value_left; - u32 key_value_right; -}; - -/* Cryp Initialization structure */ -enum cryp_init_vector_index { - CRYP_INIT_VECTOR_INDEX_0, - CRYP_INIT_VECTOR_INDEX_1 -}; - -/* struct cryp_init_vector_value - - * @init_value_left - * @init_value_right - * */ -struct cryp_init_vector_value { - u32 init_value_left; - u32 init_value_right; -}; - -/** - * struct cryp_device_context - structure for a cryp context. - * @cr: control register - * @dmacr: DMA control register - * @imsc: Interrupt mask set/clear register - * @key_1_l: Key 1l register - * @key_1_r: Key 1r register - * @key_2_l: Key 2l register - * @key_2_r: Key 2r register - * @key_3_l: Key 3l register - * @key_3_r: Key 3r register - * @key_4_l: Key 4l register - * @key_4_r: Key 4r register - * @init_vect_0_l: Initialization vector 0l register - * @init_vect_0_r: Initialization vector 0r register - * @init_vect_1_l: Initialization vector 1l register - * @init_vect_1_r: Initialization vector 0r register - * @din: Data in register - * @dout: Data out register - * - * CRYP power management specifc structure. - */ -struct cryp_device_context { - u32 cr; - u32 dmacr; - u32 imsc; - - u32 key_1_l; - u32 key_1_r; - u32 key_2_l; - u32 key_2_r; - u32 key_3_l; - u32 key_3_r; - u32 key_4_l; - u32 key_4_r; - - u32 init_vect_0_l; - u32 init_vect_0_r; - u32 init_vect_1_l; - u32 init_vect_1_r; - - u32 din; - u32 dout; -}; - -struct cryp_dma { - dma_cap_mask_t mask; - struct completion cryp_dma_complete; - struct dma_chan *chan_cryp2mem; - struct dma_chan *chan_mem2cryp; - struct stedma40_chan_cfg *cfg_cryp2mem; - struct stedma40_chan_cfg *cfg_mem2cryp; - int sg_src_len; - int sg_dst_len; - struct scatterlist *sg_src; - struct scatterlist *sg_dst; - int nents_src; - int nents_dst; -}; - -/** - * struct cryp_device_data - structure for a cryp device. - * @base: Pointer to virtual base address of the cryp device. - * @phybase: Pointer to physical memory location of the cryp device. - * @dev: Pointer to the devices dev structure. - * @clk: Pointer to the device's clock control. - * @irq: IRQ number - * @pwr_regulator: Pointer to the device's power control. - * @power_status: Current status of the power. - * @ctx_lock: Lock for current_ctx. - * @current_ctx: Pointer to the currently allocated context. - * @list_node: For inclusion into a klist. - * @dma: The dma structure holding channel configuration. - * @power_state: TRUE = power state on, FALSE = power state off. - * @power_state_spinlock: Spinlock for power_state. - * @restore_dev_ctx: TRUE = saved ctx, FALSE = no saved ctx. - */ -struct cryp_device_data { - struct cryp_register __iomem *base; - phys_addr_t phybase; - struct device *dev; - struct clk *clk; - int irq; - struct regulator *pwr_regulator; - int power_status; - spinlock_t ctx_lock; - struct cryp_ctx *current_ctx; - struct klist_node list_node; - struct cryp_dma dma; - bool power_state; - spinlock_t power_state_spinlock; - bool restore_dev_ctx; -}; - -void cryp_wait_until_done(struct cryp_device_data *device_data); - -/* Initialization functions */ - -int cryp_check(struct cryp_device_data *device_data); - -void cryp_activity(struct cryp_device_data *device_data, - enum cryp_crypen cryp_crypen); - -void cryp_flush_inoutfifo(struct cryp_device_data *device_data); - -int cryp_set_configuration(struct cryp_device_data *device_data, - struct cryp_config *cryp_config, - u32 *control_register); - -void cryp_configure_for_dma(struct cryp_device_data *device_data, - enum cryp_dma_req_type dma_req); - -int cryp_configure_key_values(struct cryp_device_data *device_data, - enum cryp_key_reg_index key_reg_index, - struct cryp_key_value key_value); - -int cryp_configure_init_vector(struct cryp_device_data *device_data, - enum cryp_init_vector_index - init_vector_index, - struct cryp_init_vector_value - init_vector_value); - -int cryp_configure_protection(struct cryp_device_data *device_data, - struct cryp_protection_config *p_protect_config); - -/* Power management funtions */ -void cryp_save_device_context(struct cryp_device_data *device_data, - struct cryp_device_context *ctx, - int cryp_mode); - -void cryp_restore_device_context(struct cryp_device_data *device_data, - struct cryp_device_context *ctx); - -/* Data transfer and status bits. */ -int cryp_is_logic_busy(struct cryp_device_data *device_data); - -int cryp_get_status(struct cryp_device_data *device_data); - -/** - * cryp_write_indata - This routine writes 32 bit data into the data input - * register of the cryptography IP. - * @device_data: Pointer to the device data struct for base address. - * @write_data: Data to write. - */ -int cryp_write_indata(struct cryp_device_data *device_data, u32 write_data); - -/** - * cryp_read_outdata - This routine reads the data from the data output - * register of the CRYP logic - * @device_data: Pointer to the device data struct for base address. - * @read_data: Read the data from the output FIFO. - */ -int cryp_read_outdata(struct cryp_device_data *device_data, u32 *read_data); - -#endif /* _CRYP_H_ */ diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c deleted file mode 100644 index 5a57c9afd8c88..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp_core.c +++ /dev/null @@ -1,1600 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Andreas Westin for ST-Ericsson. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include "cryp_p.h" -#include "cryp.h" - -#define CRYP_MAX_KEY_SIZE 32 -#define BYTES_PER_WORD 4 - -static int cryp_mode; -static atomic_t session_id; - -static struct stedma40_chan_cfg *mem_to_engine; -static struct stedma40_chan_cfg *engine_to_mem; - -/** - * struct cryp_driver_data - data specific to the driver. - * - * @device_list: A list of registered devices to choose from. - * @device_allocation: A semaphore initialized with number of devices. - */ -struct cryp_driver_data { - struct klist device_list; - struct semaphore device_allocation; -}; - -/** - * struct cryp_ctx - Crypto context - * @config: Crypto mode. - * @key: Key array. - * @keylen: Length of key. - * @iv: Pointer to initialization vector. - * @indata: Pointer to indata. - * @outdata: Pointer to outdata. - * @datalen: Length of indata. - * @outlen: Length of outdata. - * @blocksize: Size of blocks. - * @updated: Updated flag. - * @dev_ctx: Device dependent context. - * @device: Pointer to the device. - * @session_id: Atomic session ID. - */ -struct cryp_ctx { - struct cryp_config config; - u8 key[CRYP_MAX_KEY_SIZE]; - u32 keylen; - u8 *iv; - const u8 *indata; - u8 *outdata; - u32 datalen; - u32 outlen; - u32 blocksize; - u8 updated; - struct cryp_device_context dev_ctx; - struct cryp_device_data *device; - u32 session_id; -}; - -static struct cryp_driver_data driver_data; - -/** - * swap_bits_in_byte - mirror the bits in a byte - * @b: the byte to be mirrored - * - * The bits are swapped the following way: - * Byte b include bits 0-7, nibble 1 (n1) include bits 0-3 and - * nibble 2 (n2) bits 4-7. - * - * Nibble 1 (n1): - * (The "old" (moved) bit is replaced with a zero) - * 1. Move bit 6 and 7, 4 positions to the left. - * 2. Move bit 3 and 5, 2 positions to the left. - * 3. Move bit 1-4, 1 position to the left. - * - * Nibble 2 (n2): - * 1. Move bit 0 and 1, 4 positions to the right. - * 2. Move bit 2 and 4, 2 positions to the right. - * 3. Move bit 3-6, 1 position to the right. - * - * Combine the two nibbles to a complete and swapped byte. - */ - -static inline u8 swap_bits_in_byte(u8 b) -{ -#define R_SHIFT_4_MASK 0xc0 /* Bits 6 and 7, right shift 4 */ -#define R_SHIFT_2_MASK 0x28 /* (After right shift 4) Bits 3 and 5, - right shift 2 */ -#define R_SHIFT_1_MASK 0x1e /* (After right shift 2) Bits 1-4, - right shift 1 */ -#define L_SHIFT_4_MASK 0x03 /* Bits 0 and 1, left shift 4 */ -#define L_SHIFT_2_MASK 0x14 /* (After left shift 4) Bits 2 and 4, - left shift 2 */ -#define L_SHIFT_1_MASK 0x78 /* (After left shift 1) Bits 3-6, - left shift 1 */ - - u8 n1; - u8 n2; - - /* Swap most significant nibble */ - /* Right shift 4, bits 6 and 7 */ - n1 = ((b & R_SHIFT_4_MASK) >> 4) | (b & ~(R_SHIFT_4_MASK >> 4)); - /* Right shift 2, bits 3 and 5 */ - n1 = ((n1 & R_SHIFT_2_MASK) >> 2) | (n1 & ~(R_SHIFT_2_MASK >> 2)); - /* Right shift 1, bits 1-4 */ - n1 = (n1 & R_SHIFT_1_MASK) >> 1; - - /* Swap least significant nibble */ - /* Left shift 4, bits 0 and 1 */ - n2 = ((b & L_SHIFT_4_MASK) << 4) | (b & ~(L_SHIFT_4_MASK << 4)); - /* Left shift 2, bits 2 and 4 */ - n2 = ((n2 & L_SHIFT_2_MASK) << 2) | (n2 & ~(L_SHIFT_2_MASK << 2)); - /* Left shift 1, bits 3-6 */ - n2 = (n2 & L_SHIFT_1_MASK) << 1; - - return n1 | n2; -} - -static inline void swap_words_in_key_and_bits_in_byte(const u8 *in, - u8 *out, u32 len) -{ - unsigned int i = 0; - int j; - int index = 0; - - j = len - BYTES_PER_WORD; - while (j >= 0) { - for (i = 0; i < BYTES_PER_WORD; i++) { - index = len - j - BYTES_PER_WORD + i; - out[j + i] = - swap_bits_in_byte(in[index]); - } - j -= BYTES_PER_WORD; - } -} - -static void add_session_id(struct cryp_ctx *ctx) -{ - /* - * We never want 0 to be a valid value, since this is the default value - * for the software context. - */ - if (unlikely(atomic_inc_and_test(&session_id))) - atomic_inc(&session_id); - - ctx->session_id = atomic_read(&session_id); -} - -static irqreturn_t cryp_interrupt_handler(int irq, void *param) -{ - struct cryp_ctx *ctx; - int count; - struct cryp_device_data *device_data; - - if (param == NULL) { - BUG_ON(!param); - return IRQ_HANDLED; - } - - /* The device is coming from the one found in hw_crypt_noxts. */ - device_data = (struct cryp_device_data *)param; - - ctx = device_data->current_ctx; - - if (ctx == NULL) { - BUG_ON(!ctx); - return IRQ_HANDLED; - } - - dev_dbg(ctx->device->dev, "[%s] (len: %d) %s, ", __func__, ctx->outlen, - cryp_pending_irq_src(device_data, CRYP_IRQ_SRC_OUTPUT_FIFO) ? - "out" : "in"); - - if (cryp_pending_irq_src(device_data, - CRYP_IRQ_SRC_OUTPUT_FIFO)) { - if (ctx->outlen / ctx->blocksize > 0) { - count = ctx->blocksize / 4; - - readsl(&device_data->base->dout, ctx->outdata, count); - ctx->outdata += count; - ctx->outlen -= count; - - if (ctx->outlen == 0) { - cryp_disable_irq_src(device_data, - CRYP_IRQ_SRC_OUTPUT_FIFO); - } - } - } else if (cryp_pending_irq_src(device_data, - CRYP_IRQ_SRC_INPUT_FIFO)) { - if (ctx->datalen / ctx->blocksize > 0) { - count = ctx->blocksize / 4; - - writesl(&device_data->base->din, ctx->indata, count); - - ctx->indata += count; - ctx->datalen -= count; - - if (ctx->datalen == 0) - cryp_disable_irq_src(device_data, - CRYP_IRQ_SRC_INPUT_FIFO); - - if (ctx->config.algomode == CRYP_ALGO_AES_XTS) { - CRYP_PUT_BITS(&device_data->base->cr, - CRYP_START_ENABLE, - CRYP_CR_START_POS, - CRYP_CR_START_MASK); - - cryp_wait_until_done(device_data); - } - } - } - - return IRQ_HANDLED; -} - -static int mode_is_aes(enum cryp_algo_mode mode) -{ - return CRYP_ALGO_AES_ECB == mode || - CRYP_ALGO_AES_CBC == mode || - CRYP_ALGO_AES_CTR == mode || - CRYP_ALGO_AES_XTS == mode; -} - -static int cfg_iv(struct cryp_device_data *device_data, u32 left, u32 right, - enum cryp_init_vector_index index) -{ - struct cryp_init_vector_value vector_value; - - dev_dbg(device_data->dev, "[%s]", __func__); - - vector_value.init_value_left = left; - vector_value.init_value_right = right; - - return cryp_configure_init_vector(device_data, - index, - vector_value); -} - -static int cfg_ivs(struct cryp_device_data *device_data, struct cryp_ctx *ctx) -{ - int i; - int status = 0; - int num_of_regs = ctx->blocksize / 8; - __be32 *civ = (__be32 *)ctx->iv; - u32 iv[AES_BLOCK_SIZE / 4]; - - dev_dbg(device_data->dev, "[%s]", __func__); - - /* - * Since we loop on num_of_regs we need to have a check in case - * someone provides an incorrect blocksize which would force calling - * cfg_iv with i greater than 2 which is an error. - */ - if (num_of_regs > 2) { - dev_err(device_data->dev, "[%s] Incorrect blocksize %d", - __func__, ctx->blocksize); - return -EINVAL; - } - - for (i = 0; i < ctx->blocksize / 4; i++) - iv[i] = be32_to_cpup(civ + i); - - for (i = 0; i < num_of_regs; i++) { - status = cfg_iv(device_data, iv[i*2], iv[i*2+1], - (enum cryp_init_vector_index) i); - if (status != 0) - return status; - } - return status; -} - -static int set_key(struct cryp_device_data *device_data, - u32 left_key, - u32 right_key, - enum cryp_key_reg_index index) -{ - struct cryp_key_value key_value; - int cryp_error; - - dev_dbg(device_data->dev, "[%s]", __func__); - - key_value.key_value_left = left_key; - key_value.key_value_right = right_key; - - cryp_error = cryp_configure_key_values(device_data, - index, - key_value); - if (cryp_error != 0) - dev_err(device_data->dev, "[%s]: " - "cryp_configure_key_values() failed!", __func__); - - return cryp_error; -} - -static int cfg_keys(struct cryp_ctx *ctx) -{ - int i; - int num_of_regs = ctx->keylen / 8; - u32 swapped_key[CRYP_MAX_KEY_SIZE / 4]; - __be32 *ckey = (__be32 *)ctx->key; - int cryp_error = 0; - - dev_dbg(ctx->device->dev, "[%s]", __func__); - - if (mode_is_aes(ctx->config.algomode)) { - swap_words_in_key_and_bits_in_byte((u8 *)ckey, - (u8 *)swapped_key, - ctx->keylen); - } else { - for (i = 0; i < ctx->keylen / 4; i++) - swapped_key[i] = be32_to_cpup(ckey + i); - } - - for (i = 0; i < num_of_regs; i++) { - cryp_error = set_key(ctx->device, - swapped_key[i * 2], - swapped_key[i * 2 + 1], - (enum cryp_key_reg_index) i); - - if (cryp_error != 0) { - dev_err(ctx->device->dev, "[%s]: set_key() failed!", - __func__); - return cryp_error; - } - } - return cryp_error; -} - -static int cryp_setup_context(struct cryp_ctx *ctx, - struct cryp_device_data *device_data) -{ - u32 control_register = CRYP_CR_DEFAULT; - - switch (cryp_mode) { - case CRYP_MODE_INTERRUPT: - writel_relaxed(CRYP_IMSC_DEFAULT, &device_data->base->imsc); - break; - - case CRYP_MODE_DMA: - writel_relaxed(CRYP_DMACR_DEFAULT, &device_data->base->dmacr); - break; - - default: - break; - } - - if (ctx->updated == 0) { - cryp_flush_inoutfifo(device_data); - if (cfg_keys(ctx) != 0) { - dev_err(ctx->device->dev, "[%s]: cfg_keys failed!", - __func__); - return -EINVAL; - } - - if (ctx->iv && - CRYP_ALGO_AES_ECB != ctx->config.algomode && - CRYP_ALGO_DES_ECB != ctx->config.algomode && - CRYP_ALGO_TDES_ECB != ctx->config.algomode) { - if (cfg_ivs(device_data, ctx) != 0) - return -EPERM; - } - - cryp_set_configuration(device_data, &ctx->config, - &control_register); - add_session_id(ctx); - } else if (ctx->updated == 1 && - ctx->session_id != atomic_read(&session_id)) { - cryp_flush_inoutfifo(device_data); - cryp_restore_device_context(device_data, &ctx->dev_ctx); - - add_session_id(ctx); - control_register = ctx->dev_ctx.cr; - } else - control_register = ctx->dev_ctx.cr; - - writel(control_register | - (CRYP_CRYPEN_ENABLE << CRYP_CR_CRYPEN_POS), - &device_data->base->cr); - - return 0; -} - -static int cryp_get_device_data(struct cryp_ctx *ctx, - struct cryp_device_data **device_data) -{ - int ret; - struct klist_iter device_iterator; - struct klist_node *device_node; - struct cryp_device_data *local_device_data = NULL; - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - /* Wait until a device is available */ - ret = down_interruptible(&driver_data.device_allocation); - if (ret) - return ret; /* Interrupted */ - - /* Select a device */ - klist_iter_init(&driver_data.device_list, &device_iterator); - - device_node = klist_next(&device_iterator); - while (device_node) { - local_device_data = container_of(device_node, - struct cryp_device_data, list_node); - spin_lock(&local_device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (local_device_data->current_ctx) { - device_node = klist_next(&device_iterator); - } else { - local_device_data->current_ctx = ctx; - ctx->device = local_device_data; - spin_unlock(&local_device_data->ctx_lock); - break; - } - spin_unlock(&local_device_data->ctx_lock); - } - klist_iter_exit(&device_iterator); - - if (!device_node) { - /** - * No free device found. - * Since we allocated a device with down_interruptible, this - * should not be able to happen. - * Number of available devices, which are contained in - * device_allocation, is therefore decremented by not doing - * an up(device_allocation). - */ - return -EBUSY; - } - - *device_data = local_device_data; - - return 0; -} - -static void cryp_dma_setup_channel(struct cryp_device_data *device_data, - struct device *dev) -{ - struct dma_slave_config mem2cryp = { - .direction = DMA_MEM_TO_DEV, - .dst_addr = device_data->phybase + CRYP_DMA_TX_FIFO, - .dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES, - .dst_maxburst = 4, - }; - struct dma_slave_config cryp2mem = { - .direction = DMA_DEV_TO_MEM, - .src_addr = device_data->phybase + CRYP_DMA_RX_FIFO, - .src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES, - .src_maxburst = 4, - }; - - dma_cap_zero(device_data->dma.mask); - dma_cap_set(DMA_SLAVE, device_data->dma.mask); - - device_data->dma.cfg_mem2cryp = mem_to_engine; - device_data->dma.chan_mem2cryp = - dma_request_channel(device_data->dma.mask, - stedma40_filter, - device_data->dma.cfg_mem2cryp); - - device_data->dma.cfg_cryp2mem = engine_to_mem; - device_data->dma.chan_cryp2mem = - dma_request_channel(device_data->dma.mask, - stedma40_filter, - device_data->dma.cfg_cryp2mem); - - dmaengine_slave_config(device_data->dma.chan_mem2cryp, &mem2cryp); - dmaengine_slave_config(device_data->dma.chan_cryp2mem, &cryp2mem); - - init_completion(&device_data->dma.cryp_dma_complete); -} - -static void cryp_dma_out_callback(void *data) -{ - struct cryp_ctx *ctx = (struct cryp_ctx *) data; - dev_dbg(ctx->device->dev, "[%s]: ", __func__); - - complete(&ctx->device->dma.cryp_dma_complete); -} - -static int cryp_set_dma_transfer(struct cryp_ctx *ctx, - struct scatterlist *sg, - int len, - enum dma_data_direction direction) -{ - struct dma_async_tx_descriptor *desc; - struct dma_chan *channel = NULL; - dma_cookie_t cookie; - - dev_dbg(ctx->device->dev, "[%s]: ", __func__); - - if (unlikely(!IS_ALIGNED((unsigned long)sg, 4))) { - dev_err(ctx->device->dev, "[%s]: Data in sg list isn't " - "aligned! Addr: 0x%08lx", __func__, (unsigned long)sg); - return -EFAULT; - } - - switch (direction) { - case DMA_TO_DEVICE: - channel = ctx->device->dma.chan_mem2cryp; - ctx->device->dma.sg_src = sg; - ctx->device->dma.sg_src_len = dma_map_sg(channel->device->dev, - ctx->device->dma.sg_src, - ctx->device->dma.nents_src, - direction); - - if (!ctx->device->dma.sg_src_len) { - dev_dbg(ctx->device->dev, - "[%s]: Could not map the sg list (TO_DEVICE)", - __func__); - return -EFAULT; - } - - dev_dbg(ctx->device->dev, "[%s]: Setting up DMA for buffer " - "(TO_DEVICE)", __func__); - - desc = dmaengine_prep_slave_sg(channel, - ctx->device->dma.sg_src, - ctx->device->dma.sg_src_len, - DMA_MEM_TO_DEV, DMA_CTRL_ACK); - break; - - case DMA_FROM_DEVICE: - channel = ctx->device->dma.chan_cryp2mem; - ctx->device->dma.sg_dst = sg; - ctx->device->dma.sg_dst_len = dma_map_sg(channel->device->dev, - ctx->device->dma.sg_dst, - ctx->device->dma.nents_dst, - direction); - - if (!ctx->device->dma.sg_dst_len) { - dev_dbg(ctx->device->dev, - "[%s]: Could not map the sg list (FROM_DEVICE)", - __func__); - return -EFAULT; - } - - dev_dbg(ctx->device->dev, "[%s]: Setting up DMA for buffer " - "(FROM_DEVICE)", __func__); - - desc = dmaengine_prep_slave_sg(channel, - ctx->device->dma.sg_dst, - ctx->device->dma.sg_dst_len, - DMA_DEV_TO_MEM, - DMA_CTRL_ACK | - DMA_PREP_INTERRUPT); - - desc->callback = cryp_dma_out_callback; - desc->callback_param = ctx; - break; - - default: - dev_dbg(ctx->device->dev, "[%s]: Invalid DMA direction", - __func__); - return -EFAULT; - } - - cookie = dmaengine_submit(desc); - if (dma_submit_error(cookie)) { - dev_dbg(ctx->device->dev, "[%s]: DMA submission failed\n", - __func__); - return cookie; - } - - dma_async_issue_pending(channel); - - return 0; -} - -static void cryp_dma_done(struct cryp_ctx *ctx) -{ - struct dma_chan *chan; - - dev_dbg(ctx->device->dev, "[%s]: ", __func__); - - chan = ctx->device->dma.chan_mem2cryp; - dmaengine_terminate_all(chan); - dma_unmap_sg(chan->device->dev, ctx->device->dma.sg_src, - ctx->device->dma.nents_src, DMA_TO_DEVICE); - - chan = ctx->device->dma.chan_cryp2mem; - dmaengine_terminate_all(chan); - dma_unmap_sg(chan->device->dev, ctx->device->dma.sg_dst, - ctx->device->dma.nents_dst, DMA_FROM_DEVICE); -} - -static int cryp_dma_write(struct cryp_ctx *ctx, struct scatterlist *sg, - int len) -{ - int error = cryp_set_dma_transfer(ctx, sg, len, DMA_TO_DEVICE); - dev_dbg(ctx->device->dev, "[%s]: ", __func__); - - if (error) { - dev_dbg(ctx->device->dev, "[%s]: cryp_set_dma_transfer() " - "failed", __func__); - return error; - } - - return len; -} - -static int cryp_dma_read(struct cryp_ctx *ctx, struct scatterlist *sg, int len) -{ - int error = cryp_set_dma_transfer(ctx, sg, len, DMA_FROM_DEVICE); - if (error) { - dev_dbg(ctx->device->dev, "[%s]: cryp_set_dma_transfer() " - "failed", __func__); - return error; - } - - return len; -} - -static void cryp_polling_mode(struct cryp_ctx *ctx, - struct cryp_device_data *device_data) -{ - int len = ctx->blocksize / BYTES_PER_WORD; - int remaining_length = ctx->datalen; - u32 *indata = (u32 *)ctx->indata; - u32 *outdata = (u32 *)ctx->outdata; - - while (remaining_length > 0) { - writesl(&device_data->base->din, indata, len); - indata += len; - remaining_length -= (len * BYTES_PER_WORD); - cryp_wait_until_done(device_data); - - readsl(&device_data->base->dout, outdata, len); - outdata += len; - cryp_wait_until_done(device_data); - } -} - -static int cryp_disable_power(struct device *dev, - struct cryp_device_data *device_data, - bool save_device_context) -{ - int ret = 0; - - dev_dbg(dev, "[%s]", __func__); - - spin_lock(&device_data->power_state_spinlock); - if (!device_data->power_state) - goto out; - - spin_lock(&device_data->ctx_lock); - if (save_device_context && device_data->current_ctx) { - cryp_save_device_context(device_data, - &device_data->current_ctx->dev_ctx, - cryp_mode); - device_data->restore_dev_ctx = true; - } - spin_unlock(&device_data->ctx_lock); - - clk_disable(device_data->clk); - ret = regulator_disable(device_data->pwr_regulator); - if (ret) - dev_err(dev, "[%s]: " - "regulator_disable() failed!", - __func__); - - device_data->power_state = false; - -out: - spin_unlock(&device_data->power_state_spinlock); - - return ret; -} - -static int cryp_enable_power( - struct device *dev, - struct cryp_device_data *device_data, - bool restore_device_context) -{ - int ret = 0; - - dev_dbg(dev, "[%s]", __func__); - - spin_lock(&device_data->power_state_spinlock); - if (!device_data->power_state) { - ret = regulator_enable(device_data->pwr_regulator); - if (ret) { - dev_err(dev, "[%s]: regulator_enable() failed!", - __func__); - goto out; - } - - ret = clk_enable(device_data->clk); - if (ret) { - dev_err(dev, "[%s]: clk_enable() failed!", - __func__); - regulator_disable(device_data->pwr_regulator); - goto out; - } - device_data->power_state = true; - } - - if (device_data->restore_dev_ctx) { - spin_lock(&device_data->ctx_lock); - if (restore_device_context && device_data->current_ctx) { - device_data->restore_dev_ctx = false; - cryp_restore_device_context(device_data, - &device_data->current_ctx->dev_ctx); - } - spin_unlock(&device_data->ctx_lock); - } -out: - spin_unlock(&device_data->power_state_spinlock); - - return ret; -} - -static int hw_crypt_noxts(struct cryp_ctx *ctx, - struct cryp_device_data *device_data) -{ - int ret = 0; - - const u8 *indata = ctx->indata; - u8 *outdata = ctx->outdata; - u32 datalen = ctx->datalen; - u32 outlen = datalen; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - ctx->outlen = ctx->datalen; - - if (unlikely(!IS_ALIGNED((unsigned long)indata, 4))) { - pr_debug(DEV_DBG_NAME " [%s]: Data isn't aligned! Addr: " - "0x%08lx", __func__, (unsigned long)indata); - return -EINVAL; - } - - ret = cryp_setup_context(ctx, device_data); - - if (ret) - goto out; - - if (cryp_mode == CRYP_MODE_INTERRUPT) { - cryp_enable_irq_src(device_data, CRYP_IRQ_SRC_INPUT_FIFO | - CRYP_IRQ_SRC_OUTPUT_FIFO); - - /* - * ctx->outlen is decremented in the cryp_interrupt_handler - * function. We had to add cpu_relax() (barrier) to make sure - * that gcc didn't optimze away this variable. - */ - while (ctx->outlen > 0) - cpu_relax(); - } else if (cryp_mode == CRYP_MODE_POLLING || - cryp_mode == CRYP_MODE_DMA) { - /* - * The reason for having DMA in this if case is that if we are - * running cryp_mode = 2, then we separate DMA routines for - * handling cipher/plaintext > blocksize, except when - * running the normal CRYPTO_ALG_TYPE_CIPHER, then we still use - * the polling mode. Overhead of doing DMA setup eats up the - * benefits using it. - */ - cryp_polling_mode(ctx, device_data); - } else { - dev_err(ctx->device->dev, "[%s]: Invalid operation mode!", - __func__); - ret = -EPERM; - goto out; - } - - cryp_save_device_context(device_data, &ctx->dev_ctx, cryp_mode); - ctx->updated = 1; - -out: - ctx->indata = indata; - ctx->outdata = outdata; - ctx->datalen = datalen; - ctx->outlen = outlen; - - return ret; -} - -static int get_nents(struct scatterlist *sg, int nbytes) -{ - int nents = 0; - - while (nbytes > 0) { - nbytes -= sg->length; - sg = sg_next(sg); - nents++; - } - - return nents; -} - -static int ablk_dma_crypt(struct skcipher_request *areq) -{ - struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(areq); - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - struct cryp_device_data *device_data; - - int bytes_written = 0; - int bytes_read = 0; - int ret; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - ctx->datalen = areq->cryptlen; - ctx->outlen = areq->cryptlen; - - ret = cryp_get_device_data(ctx, &device_data); - if (ret) - return ret; - - ret = cryp_setup_context(ctx, device_data); - if (ret) - goto out; - - /* We have the device now, so store the nents in the dma struct. */ - ctx->device->dma.nents_src = get_nents(areq->src, ctx->datalen); - ctx->device->dma.nents_dst = get_nents(areq->dst, ctx->outlen); - - /* Enable DMA in- and output. */ - cryp_configure_for_dma(device_data, CRYP_DMA_ENABLE_BOTH_DIRECTIONS); - - bytes_written = cryp_dma_write(ctx, areq->src, ctx->datalen); - bytes_read = cryp_dma_read(ctx, areq->dst, bytes_written); - - wait_for_completion(&ctx->device->dma.cryp_dma_complete); - cryp_dma_done(ctx); - - cryp_save_device_context(device_data, &ctx->dev_ctx, cryp_mode); - ctx->updated = 1; - -out: - spin_lock(&device_data->ctx_lock); - device_data->current_ctx = NULL; - ctx->device = NULL; - spin_unlock(&device_data->ctx_lock); - - /* - * The down_interruptible part for this semaphore is called in - * cryp_get_device_data. - */ - up(&driver_data.device_allocation); - - if (unlikely(bytes_written != bytes_read)) - return -EPERM; - - return 0; -} - -static int ablk_crypt(struct skcipher_request *areq) -{ - struct skcipher_walk walk; - struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(areq); - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - struct cryp_device_data *device_data; - unsigned long src_paddr; - unsigned long dst_paddr; - int ret; - int nbytes; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - ret = cryp_get_device_data(ctx, &device_data); - if (ret) - goto out; - - ret = skcipher_walk_async(&walk, areq); - - if (ret) { - pr_err(DEV_DBG_NAME "[%s]: skcipher_walk_async() failed!", - __func__); - goto out; - } - - while ((nbytes = walk.nbytes) > 0) { - ctx->iv = walk.iv; - src_paddr = (page_to_phys(walk.src.phys.page) + walk.src.phys.offset); - ctx->indata = phys_to_virt(src_paddr); - - dst_paddr = (page_to_phys(walk.dst.phys.page) + walk.dst.phys.offset); - ctx->outdata = phys_to_virt(dst_paddr); - - ctx->datalen = nbytes - (nbytes % ctx->blocksize); - - ret = hw_crypt_noxts(ctx, device_data); - if (ret) - goto out; - - nbytes -= ctx->datalen; - ret = skcipher_walk_done(&walk, nbytes); - if (ret) - goto out; - } - -out: - /* Release the device */ - spin_lock(&device_data->ctx_lock); - device_data->current_ctx = NULL; - ctx->device = NULL; - spin_unlock(&device_data->ctx_lock); - - /* - * The down_interruptible part for this semaphore is called in - * cryp_get_device_data. - */ - up(&driver_data.device_allocation); - - return ret; -} - -static int aes_skcipher_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - switch (keylen) { - case AES_KEYSIZE_128: - ctx->config.keysize = CRYP_KEY_SIZE_128; - break; - - case AES_KEYSIZE_192: - ctx->config.keysize = CRYP_KEY_SIZE_192; - break; - - case AES_KEYSIZE_256: - ctx->config.keysize = CRYP_KEY_SIZE_256; - break; - - default: - pr_err(DEV_DBG_NAME "[%s]: Unknown keylen!", __func__); - return -EINVAL; - } - - memcpy(ctx->key, key, keylen); - ctx->keylen = keylen; - - ctx->updated = 0; - - return 0; -} - -static int des_skcipher_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - int err; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - err = verify_skcipher_des_key(cipher, key); - if (err) - return err; - - memcpy(ctx->key, key, keylen); - ctx->keylen = keylen; - - ctx->updated = 0; - return 0; -} - -static int des3_skcipher_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - int err; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - err = verify_skcipher_des3_key(cipher, key); - if (err) - return err; - - memcpy(ctx->key, key, keylen); - ctx->keylen = keylen; - - ctx->updated = 0; - return 0; -} - -static int cryp_blk_encrypt(struct skcipher_request *areq) -{ - struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(areq); - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - ctx->config.algodir = CRYP_ALGORITHM_ENCRYPT; - - /* - * DMA does not work for DES due to a hw bug */ - if (cryp_mode == CRYP_MODE_DMA && mode_is_aes(ctx->config.algomode)) - return ablk_dma_crypt(areq); - - /* For everything except DMA, we run the non DMA version. */ - return ablk_crypt(areq); -} - -static int cryp_blk_decrypt(struct skcipher_request *areq) -{ - struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(areq); - struct cryp_ctx *ctx = crypto_skcipher_ctx(cipher); - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - ctx->config.algodir = CRYP_ALGORITHM_DECRYPT; - - /* DMA does not work for DES due to a hw bug */ - if (cryp_mode == CRYP_MODE_DMA && mode_is_aes(ctx->config.algomode)) - return ablk_dma_crypt(areq); - - /* For everything except DMA, we run the non DMA version. */ - return ablk_crypt(areq); -} - -struct cryp_algo_template { - enum cryp_algo_mode algomode; - struct skcipher_alg skcipher; -}; - -static int cryp_init_tfm(struct crypto_skcipher *tfm) -{ - struct cryp_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_alg *alg = crypto_skcipher_alg(tfm); - struct cryp_algo_template *cryp_alg = container_of(alg, - struct cryp_algo_template, - skcipher); - - ctx->config.algomode = cryp_alg->algomode; - ctx->blocksize = crypto_skcipher_blocksize(tfm); - - return 0; -} - -static struct cryp_algo_template cryp_algs[] = { - { - .algomode = CRYP_ALGO_AES_ECB, - .skcipher = { - .base.cra_name = "ecb(aes)", - .base.cra_driver_name = "ecb-aes-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .init = cryp_init_tfm, - } - }, - { - .algomode = CRYP_ALGO_AES_CBC, - .skcipher = { - .base.cra_name = "cbc(aes)", - .base.cra_driver_name = "cbc-aes-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .init = cryp_init_tfm, - .ivsize = AES_BLOCK_SIZE, - } - }, - { - .algomode = CRYP_ALGO_AES_CTR, - .skcipher = { - .base.cra_name = "ctr(aes)", - .base.cra_driver_name = "ctr-aes-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .init = cryp_init_tfm, - .ivsize = AES_BLOCK_SIZE, - .chunksize = AES_BLOCK_SIZE, - } - }, - { - .algomode = CRYP_ALGO_DES_ECB, - .skcipher = { - .base.cra_name = "ecb(des)", - .base.cra_driver_name = "ecb-des-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .setkey = des_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .init = cryp_init_tfm, - } - }, - { - .algomode = CRYP_ALGO_TDES_ECB, - .skcipher = { - .base.cra_name = "ecb(des3_ede)", - .base.cra_driver_name = "ecb-des3_ede-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .setkey = des3_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .init = cryp_init_tfm, - } - }, - { - .algomode = CRYP_ALGO_DES_CBC, - .skcipher = { - .base.cra_name = "cbc(des)", - .base.cra_driver_name = "cbc-des-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = DES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = DES_KEY_SIZE, - .max_keysize = DES_KEY_SIZE, - .setkey = des_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .ivsize = DES_BLOCK_SIZE, - .init = cryp_init_tfm, - } - }, - { - .algomode = CRYP_ALGO_TDES_CBC, - .skcipher = { - .base.cra_name = "cbc(des3_ede)", - .base.cra_driver_name = "cbc-des3_ede-ux500", - .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, - .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cryp_ctx), - .base.cra_alignmask = 3, - .base.cra_module = THIS_MODULE, - - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .setkey = des3_skcipher_setkey, - .encrypt = cryp_blk_encrypt, - .decrypt = cryp_blk_decrypt, - .ivsize = DES3_EDE_BLOCK_SIZE, - .init = cryp_init_tfm, - } - } -}; - -/** - * cryp_algs_register_all - - */ -static int cryp_algs_register_all(void) -{ - int ret; - int i; - int count; - - pr_debug("[%s]", __func__); - - for (i = 0; i < ARRAY_SIZE(cryp_algs); i++) { - ret = crypto_register_skcipher(&cryp_algs[i].skcipher); - if (ret) { - count = i; - pr_err("[%s] alg registration failed", - cryp_algs[i].skcipher.base.cra_driver_name); - goto unreg; - } - } - return 0; -unreg: - for (i = 0; i < count; i++) - crypto_unregister_skcipher(&cryp_algs[i].skcipher); - return ret; -} - -/** - * cryp_algs_unregister_all - - */ -static void cryp_algs_unregister_all(void) -{ - int i; - - pr_debug(DEV_DBG_NAME " [%s]", __func__); - - for (i = 0; i < ARRAY_SIZE(cryp_algs); i++) - crypto_unregister_skcipher(&cryp_algs[i].skcipher); -} - -static int ux500_cryp_probe(struct platform_device *pdev) -{ - int ret; - struct resource *res; - struct cryp_device_data *device_data; - struct cryp_protection_config prot = { - .privilege_access = CRYP_STATE_ENABLE - }; - struct device *dev = &pdev->dev; - - dev_dbg(dev, "[%s]", __func__); - device_data = devm_kzalloc(dev, sizeof(*device_data), GFP_KERNEL); - if (!device_data) { - ret = -ENOMEM; - goto out; - } - - device_data->dev = dev; - device_data->current_ctx = NULL; - - /* Grab the DMA configuration from platform data. */ - mem_to_engine = &((struct cryp_platform_data *) - dev->platform_data)->mem_to_engine; - engine_to_mem = &((struct cryp_platform_data *) - dev->platform_data)->engine_to_mem; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "[%s]: platform_get_resource() failed", - __func__); - ret = -ENODEV; - goto out; - } - - device_data->phybase = res->start; - device_data->base = devm_ioremap_resource(dev, res); - if (IS_ERR(device_data->base)) { - ret = PTR_ERR(device_data->base); - goto out; - } - - spin_lock_init(&device_data->ctx_lock); - spin_lock_init(&device_data->power_state_spinlock); - - /* Enable power for CRYP hardware block */ - device_data->pwr_regulator = regulator_get(&pdev->dev, "v-ape"); - if (IS_ERR(device_data->pwr_regulator)) { - dev_err(dev, "[%s]: could not get cryp regulator", __func__); - ret = PTR_ERR(device_data->pwr_regulator); - device_data->pwr_regulator = NULL; - goto out; - } - - /* Enable the clk for CRYP hardware block */ - device_data->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(device_data->clk)) { - dev_err(dev, "[%s]: clk_get() failed!", __func__); - ret = PTR_ERR(device_data->clk); - goto out_regulator; - } - - ret = clk_prepare(device_data->clk); - if (ret) { - dev_err(dev, "[%s]: clk_prepare() failed!", __func__); - goto out_regulator; - } - - /* Enable device power (and clock) */ - ret = cryp_enable_power(device_data->dev, device_data, false); - if (ret) { - dev_err(dev, "[%s]: cryp_enable_power() failed!", __func__); - goto out_clk_unprepare; - } - - if (cryp_check(device_data)) { - dev_err(dev, "[%s]: cryp_check() failed!", __func__); - ret = -EINVAL; - goto out_power; - } - - if (cryp_configure_protection(device_data, &prot)) { - dev_err(dev, "[%s]: cryp_configure_protection() failed!", - __func__); - ret = -EINVAL; - goto out_power; - } - - device_data->irq = platform_get_irq(pdev, 0); - if (device_data->irq <= 0) { - ret = device_data->irq ? device_data->irq : -ENXIO; - goto out_power; - } - - ret = devm_request_irq(&pdev->dev, device_data->irq, - cryp_interrupt_handler, 0, "cryp1", device_data); - if (ret) { - dev_err(dev, "[%s]: Unable to request IRQ", __func__); - goto out_power; - } - - if (cryp_mode == CRYP_MODE_DMA) - cryp_dma_setup_channel(device_data, dev); - - platform_set_drvdata(pdev, device_data); - - /* Put the new device into the device list... */ - klist_add_tail(&device_data->list_node, &driver_data.device_list); - - /* ... and signal that a new device is available. */ - up(&driver_data.device_allocation); - - atomic_set(&session_id, 1); - - ret = cryp_algs_register_all(); - if (ret) { - dev_err(dev, "[%s]: cryp_algs_register_all() failed!", - __func__); - goto out_power; - } - - dev_info(dev, "successfully registered\n"); - - return 0; - -out_power: - cryp_disable_power(device_data->dev, device_data, false); - -out_clk_unprepare: - clk_unprepare(device_data->clk); - -out_regulator: - regulator_put(device_data->pwr_regulator); - -out: - return ret; -} - -static int ux500_cryp_remove(struct platform_device *pdev) -{ - struct cryp_device_data *device_data; - - dev_dbg(&pdev->dev, "[%s]", __func__); - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(&pdev->dev, "[%s]: platform_get_drvdata() failed!", - __func__); - return -ENOMEM; - } - - /* Try to decrease the number of available devices. */ - if (down_trylock(&driver_data.device_allocation)) - return -EBUSY; - - /* Check that the device is free */ - spin_lock(&device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (device_data->current_ctx) { - /* The device is busy */ - spin_unlock(&device_data->ctx_lock); - /* Return the device to the pool. */ - up(&driver_data.device_allocation); - return -EBUSY; - } - - spin_unlock(&device_data->ctx_lock); - - /* Remove the device from the list */ - if (klist_node_attached(&device_data->list_node)) - klist_remove(&device_data->list_node); - - /* If this was the last device, remove the services */ - if (list_empty(&driver_data.device_list.k_list)) - cryp_algs_unregister_all(); - - if (cryp_disable_power(&pdev->dev, device_data, false)) - dev_err(&pdev->dev, "[%s]: cryp_disable_power() failed", - __func__); - - clk_unprepare(device_data->clk); - regulator_put(device_data->pwr_regulator); - - return 0; -} - -static void ux500_cryp_shutdown(struct platform_device *pdev) -{ - struct cryp_device_data *device_data; - - dev_dbg(&pdev->dev, "[%s]", __func__); - - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(&pdev->dev, "[%s]: platform_get_drvdata() failed!", - __func__); - return; - } - - /* Check that the device is free */ - spin_lock(&device_data->ctx_lock); - /* current_ctx allocates a device, NULL = unallocated */ - if (!device_data->current_ctx) { - if (down_trylock(&driver_data.device_allocation)) - dev_dbg(&pdev->dev, "[%s]: Cryp still in use!" - "Shutting down anyway...", __func__); - /** - * (Allocate the device) - * Need to set this to non-null (dummy) value, - * to avoid usage if context switching. - */ - device_data->current_ctx++; - } - spin_unlock(&device_data->ctx_lock); - - /* Remove the device from the list */ - if (klist_node_attached(&device_data->list_node)) - klist_remove(&device_data->list_node); - - /* If this was the last device, remove the services */ - if (list_empty(&driver_data.device_list.k_list)) - cryp_algs_unregister_all(); - - if (cryp_disable_power(&pdev->dev, device_data, false)) - dev_err(&pdev->dev, "[%s]: cryp_disable_power() failed", - __func__); - -} - -#ifdef CONFIG_PM_SLEEP -static int ux500_cryp_suspend(struct device *dev) -{ - int ret; - struct platform_device *pdev = to_platform_device(dev); - struct cryp_device_data *device_data; - struct cryp_ctx *temp_ctx = NULL; - - dev_dbg(dev, "[%s]", __func__); - - /* Handle state? */ - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(dev, "[%s]: platform_get_drvdata() failed!", __func__); - return -ENOMEM; - } - - disable_irq(device_data->irq); - - spin_lock(&device_data->ctx_lock); - if (!device_data->current_ctx) - device_data->current_ctx++; - spin_unlock(&device_data->ctx_lock); - - if (device_data->current_ctx == ++temp_ctx) { - if (down_interruptible(&driver_data.device_allocation)) - dev_dbg(dev, "[%s]: down_interruptible() failed", - __func__); - ret = cryp_disable_power(dev, device_data, false); - - } else - ret = cryp_disable_power(dev, device_data, true); - - if (ret) - dev_err(dev, "[%s]: cryp_disable_power()", __func__); - - return ret; -} - -static int ux500_cryp_resume(struct device *dev) -{ - int ret = 0; - struct platform_device *pdev = to_platform_device(dev); - struct cryp_device_data *device_data; - struct cryp_ctx *temp_ctx = NULL; - - dev_dbg(dev, "[%s]", __func__); - - device_data = platform_get_drvdata(pdev); - if (!device_data) { - dev_err(dev, "[%s]: platform_get_drvdata() failed!", __func__); - return -ENOMEM; - } - - spin_lock(&device_data->ctx_lock); - if (device_data->current_ctx == ++temp_ctx) - device_data->current_ctx = NULL; - spin_unlock(&device_data->ctx_lock); - - - if (!device_data->current_ctx) - up(&driver_data.device_allocation); - else - ret = cryp_enable_power(dev, device_data, true); - - if (ret) - dev_err(dev, "[%s]: cryp_enable_power() failed!", __func__); - else - enable_irq(device_data->irq); - - return ret; -} -#endif - -static SIMPLE_DEV_PM_OPS(ux500_cryp_pm, ux500_cryp_suspend, ux500_cryp_resume); - -static const struct of_device_id ux500_cryp_match[] = { - { .compatible = "stericsson,ux500-cryp" }, - { }, -}; -MODULE_DEVICE_TABLE(of, ux500_cryp_match); - -static struct platform_driver cryp_driver = { - .probe = ux500_cryp_probe, - .remove = ux500_cryp_remove, - .shutdown = ux500_cryp_shutdown, - .driver = { - .name = "cryp1", - .of_match_table = ux500_cryp_match, - .pm = &ux500_cryp_pm, - } -}; - -static int __init ux500_cryp_mod_init(void) -{ - pr_debug("[%s] is called!", __func__); - klist_init(&driver_data.device_list, NULL, NULL); - /* Initialize the semaphore to 0 devices (locked state) */ - sema_init(&driver_data.device_allocation, 0); - return platform_driver_register(&cryp_driver); -} - -static void __exit ux500_cryp_mod_fini(void) -{ - pr_debug("[%s] is called!", __func__); - platform_driver_unregister(&cryp_driver); -} - -module_init(ux500_cryp_mod_init); -module_exit(ux500_cryp_mod_fini); - -module_param(cryp_mode, int, 0); - -MODULE_DESCRIPTION("Driver for ST-Ericsson UX500 CRYP crypto engine."); -MODULE_ALIAS_CRYPTO("aes-all"); -MODULE_ALIAS_CRYPTO("des-all"); - -MODULE_LICENSE("GPL"); diff --git a/drivers/crypto/ux500/cryp/cryp_irq.c b/drivers/crypto/ux500/cryp/cryp_irq.c deleted file mode 100644 index 6d2f07bec98a7..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp_irq.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - */ - -#include -#include -#include - -#include "cryp.h" -#include "cryp_p.h" -#include "cryp_irq.h" -#include "cryp_irqp.h" - -void cryp_enable_irq_src(struct cryp_device_data *device_data, u32 irq_src) -{ - u32 i; - - dev_dbg(device_data->dev, "[%s]", __func__); - - i = readl_relaxed(&device_data->base->imsc); - i = i | irq_src; - writel_relaxed(i, &device_data->base->imsc); -} - -void cryp_disable_irq_src(struct cryp_device_data *device_data, u32 irq_src) -{ - u32 i; - - dev_dbg(device_data->dev, "[%s]", __func__); - - i = readl_relaxed(&device_data->base->imsc); - i = i & ~irq_src; - writel_relaxed(i, &device_data->base->imsc); -} - -bool cryp_pending_irq_src(struct cryp_device_data *device_data, u32 irq_src) -{ - return (readl_relaxed(&device_data->base->mis) & irq_src) > 0; -} diff --git a/drivers/crypto/ux500/cryp/cryp_irq.h b/drivers/crypto/ux500/cryp/cryp_irq.h deleted file mode 100644 index da90029ea1412..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp_irq.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - */ - -#ifndef _CRYP_IRQ_H_ -#define _CRYP_IRQ_H_ - -#include "cryp.h" - -enum cryp_irq_src_id { - CRYP_IRQ_SRC_INPUT_FIFO = 0x1, - CRYP_IRQ_SRC_OUTPUT_FIFO = 0x2, - CRYP_IRQ_SRC_ALL = 0x3 -}; - -/* - * M0 Funtions - */ -void cryp_enable_irq_src(struct cryp_device_data *device_data, u32 irq_src); - -void cryp_disable_irq_src(struct cryp_device_data *device_data, u32 irq_src); - -bool cryp_pending_irq_src(struct cryp_device_data *device_data, u32 irq_src); - -#endif /* _CRYP_IRQ_H_ */ diff --git a/drivers/crypto/ux500/cryp/cryp_irqp.h b/drivers/crypto/ux500/cryp/cryp_irqp.h deleted file mode 100644 index 4981a3f461e5e..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp_irqp.h +++ /dev/null @@ -1,125 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - */ - -#ifndef __CRYP_IRQP_H_ -#define __CRYP_IRQP_H_ - -#include "cryp_irq.h" - -/* - * - * CRYP Registers - Offset mapping - * +-----------------+ - * 00h | CRYP_CR | Configuration register - * +-----------------+ - * 04h | CRYP_SR | Status register - * +-----------------+ - * 08h | CRYP_DIN | Data In register - * +-----------------+ - * 0ch | CRYP_DOUT | Data out register - * +-----------------+ - * 10h | CRYP_DMACR | DMA control register - * +-----------------+ - * 14h | CRYP_IMSC | IMSC - * +-----------------+ - * 18h | CRYP_RIS | Raw interrupt status - * +-----------------+ - * 1ch | CRYP_MIS | Masked interrupt status. - * +-----------------+ - * Key registers - * IVR registers - * Peripheral - * Cell IDs - * - * Refer data structure for other register map - */ - -/** - * struct cryp_register - * @cr - Configuration register - * @status - Status register - * @din - Data input register - * @din_size - Data input size register - * @dout - Data output register - * @dout_size - Data output size register - * @dmacr - Dma control register - * @imsc - Interrupt mask set/clear register - * @ris - Raw interrupt status - * @mis - Masked interrupt statu register - * @key_1_l - Key register 1 L - * @key_1_r - Key register 1 R - * @key_2_l - Key register 2 L - * @key_2_r - Key register 2 R - * @key_3_l - Key register 3 L - * @key_3_r - Key register 3 R - * @key_4_l - Key register 4 L - * @key_4_r - Key register 4 R - * @init_vect_0_l - init vector 0 L - * @init_vect_0_r - init vector 0 R - * @init_vect_1_l - init vector 1 L - * @init_vect_1_r - init vector 1 R - * @cryp_unused1 - unused registers - * @itcr - Integration test control register - * @itip - Integration test input register - * @itop - Integration test output register - * @cryp_unused2 - unused registers - * @periphId0 - FE0 CRYP Peripheral Identication Register - * @periphId1 - FE4 - * @periphId2 - FE8 - * @periphId3 - FEC - * @pcellId0 - FF0 CRYP PCell Identication Register - * @pcellId1 - FF4 - * @pcellId2 - FF8 - * @pcellId3 - FFC - */ -struct cryp_register { - u32 cr; /* Configuration register */ - u32 sr; /* Status register */ - u32 din; /* Data input register */ - u32 din_size; /* Data input size register */ - u32 dout; /* Data output register */ - u32 dout_size; /* Data output size register */ - u32 dmacr; /* Dma control register */ - u32 imsc; /* Interrupt mask set/clear register */ - u32 ris; /* Raw interrupt status */ - u32 mis; /* Masked interrupt statu register */ - - u32 key_1_l; /*Key register 1 L */ - u32 key_1_r; /*Key register 1 R */ - u32 key_2_l; /*Key register 2 L */ - u32 key_2_r; /*Key register 2 R */ - u32 key_3_l; /*Key register 3 L */ - u32 key_3_r; /*Key register 3 R */ - u32 key_4_l; /*Key register 4 L */ - u32 key_4_r; /*Key register 4 R */ - - u32 init_vect_0_l; /*init vector 0 L */ - u32 init_vect_0_r; /*init vector 0 R */ - u32 init_vect_1_l; /*init vector 1 L */ - u32 init_vect_1_r; /*init vector 1 R */ - - u32 cryp_unused1[(0x80 - 0x58) / sizeof(u32)]; /* unused registers */ - u32 itcr; /*Integration test control register */ - u32 itip; /*Integration test input register */ - u32 itop; /*Integration test output register */ - u32 cryp_unused2[(0xFE0 - 0x8C) / sizeof(u32)]; /* unused registers */ - - u32 periphId0; /* FE0 CRYP Peripheral Identication Register */ - u32 periphId1; /* FE4 */ - u32 periphId2; /* FE8 */ - u32 periphId3; /* FEC */ - - u32 pcellId0; /* FF0 CRYP PCell Identication Register */ - u32 pcellId1; /* FF4 */ - u32 pcellId2; /* FF8 */ - u32 pcellId3; /* FFC */ -}; - -#endif diff --git a/drivers/crypto/ux500/cryp/cryp_p.h b/drivers/crypto/ux500/cryp/cryp_p.h deleted file mode 100644 index 60b47fe4de35d..0000000000000 --- a/drivers/crypto/ux500/cryp/cryp_p.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * Author: Shujuan Chen for ST-Ericsson. - * Author: Jonas Linde for ST-Ericsson. - * Author: Joakim Bech for ST-Ericsson. - * Author: Berne Hebark for ST-Ericsson. - * Author: Niklas Hernaeus for ST-Ericsson. - */ - -#ifndef _CRYP_P_H_ -#define _CRYP_P_H_ - -#include -#include - -#include "cryp.h" -#include "cryp_irqp.h" - -/* - * Generic Macros - */ -#define CRYP_SET_BITS(reg_name, mask) \ - writel_relaxed((readl_relaxed(reg_name) | mask), reg_name) - -#define CRYP_WRITE_BIT(reg_name, val, mask) \ - writel_relaxed(((readl_relaxed(reg_name) & ~(mask)) |\ - ((val) & (mask))), reg_name) - -#define CRYP_TEST_BITS(reg_name, val) \ - (readl_relaxed(reg_name) & (val)) - -#define CRYP_PUT_BITS(reg, val, shift, mask) \ - writel_relaxed(((readl_relaxed(reg) & ~(mask)) | \ - (((u32)val << shift) & (mask))), reg) - -/* - * CRYP specific Macros - */ -#define CRYP_PERIPHERAL_ID0 0xE3 -#define CRYP_PERIPHERAL_ID1 0x05 - -#define CRYP_PERIPHERAL_ID2_DB8500 0x28 -#define CRYP_PERIPHERAL_ID3 0x00 - -#define CRYP_PCELL_ID0 0x0D -#define CRYP_PCELL_ID1 0xF0 -#define CRYP_PCELL_ID2 0x05 -#define CRYP_PCELL_ID3 0xB1 - -/* - * CRYP register default values - */ -#define MAX_DEVICE_SUPPORT 2 - -/* Priv set, keyrden set and datatype 8bits swapped set as default. */ -#define CRYP_CR_DEFAULT 0x0482 -#define CRYP_DMACR_DEFAULT 0x0 -#define CRYP_IMSC_DEFAULT 0x0 -#define CRYP_DIN_DEFAULT 0x0 -#define CRYP_DOUT_DEFAULT 0x0 -#define CRYP_KEY_DEFAULT 0x0 -#define CRYP_INIT_VECT_DEFAULT 0x0 - -/* - * CRYP Control register specific mask - */ -#define CRYP_CR_SECURE_MASK BIT(0) -#define CRYP_CR_PRLG_MASK BIT(1) -#define CRYP_CR_ALGODIR_MASK BIT(2) -#define CRYP_CR_ALGOMODE_MASK (BIT(5) | BIT(4) | BIT(3)) -#define CRYP_CR_DATATYPE_MASK (BIT(7) | BIT(6)) -#define CRYP_CR_KEYSIZE_MASK (BIT(9) | BIT(8)) -#define CRYP_CR_KEYRDEN_MASK BIT(10) -#define CRYP_CR_KSE_MASK BIT(11) -#define CRYP_CR_START_MASK BIT(12) -#define CRYP_CR_INIT_MASK BIT(13) -#define CRYP_CR_FFLUSH_MASK BIT(14) -#define CRYP_CR_CRYPEN_MASK BIT(15) -#define CRYP_CR_CONTEXT_SAVE_MASK (CRYP_CR_SECURE_MASK |\ - CRYP_CR_PRLG_MASK |\ - CRYP_CR_ALGODIR_MASK |\ - CRYP_CR_ALGOMODE_MASK |\ - CRYP_CR_KEYSIZE_MASK |\ - CRYP_CR_KEYRDEN_MASK |\ - CRYP_CR_DATATYPE_MASK) - - -#define CRYP_SR_INFIFO_READY_MASK (BIT(0) | BIT(1)) -#define CRYP_SR_IFEM_MASK BIT(0) -#define CRYP_SR_BUSY_MASK BIT(4) - -/* - * Bit position used while setting bits in register - */ -#define CRYP_CR_PRLG_POS 1 -#define CRYP_CR_ALGODIR_POS 2 -#define CRYP_CR_ALGOMODE_POS 3 -#define CRYP_CR_DATATYPE_POS 6 -#define CRYP_CR_KEYSIZE_POS 8 -#define CRYP_CR_KEYRDEN_POS 10 -#define CRYP_CR_KSE_POS 11 -#define CRYP_CR_START_POS 12 -#define CRYP_CR_INIT_POS 13 -#define CRYP_CR_CRYPEN_POS 15 - -#define CRYP_SR_BUSY_POS 4 - -/* - * CRYP PCRs------PC_NAND control register - * BIT_MASK - */ -#define CRYP_DMA_REQ_MASK (BIT(1) | BIT(0)) -#define CRYP_DMA_REQ_MASK_POS 0 - - -struct cryp_system_context { - /* CRYP Register structure */ - struct cryp_register *p_cryp_reg[MAX_DEVICE_SUPPORT]; -}; - -#endif -- GitLab From c9b8a83a8f2dca9f82288a621595a6a5970cdc5e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 7 Dec 2022 16:44:41 -0400 Subject: [PATCH 1282/1423] iommufd: Fix comment typos Repair some typos in comments that were noticed late in the review cycle. Fixes: f394576eb11d ("iommufd: PFN handling for iopt_pages") Link: https://lore.kernel.org/r/1-v1-0362a1a1c034+98-iommufd_fixes1_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Reported-by: Binbin Wu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/io_pagetable.h | 2 +- drivers/iommu/iommufd/pages.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 83e7c175f2a27..0ec3509b7e339 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -33,7 +33,7 @@ struct iommu_domain; * * The io_pagetable::iova_rwsem protects node * The iopt_pages::mutex protects pages_node - * iopt and immu_prot are immutable + * iopt and iommu_prot are immutable * The pages::mutex protects num_accesses */ struct iopt_area { diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 429fa3b0a239c..fccdba782cb69 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -342,7 +342,7 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) kfree(batch->pfns); } -/* true if the pfn could be added, false otherwise */ +/* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); @@ -418,7 +418,7 @@ static struct page **raw_pages_from_domain(struct iommu_domain *domain, return out_pages; } -/* Continues reading a domain until we reach a discontiguity in the pfns. */ +/* Continues reading a domain until we reach a discontinuity in the pfns. */ static void batch_from_domain_continue(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, -- GitLab From a26fa392068d1dcdf781397b7a7dd908dd68f030 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 7 Dec 2022 16:44:42 -0400 Subject: [PATCH 1283/1423] iommufd: Improve a few unclear bits of code Correct a few items noticed late in review: - We should assert that the math in batch_clear_carry() doesn't underflow - user->locked should be -1 not 0 sicne we just did mmput - npages should not have been recalculated, it already has that value No functional change. Fixes: 8d160cd4d506 ("iommufd: Algorithms for PFN storage") Link: https://lore.kernel.org/r/2-v1-0362a1a1c034+98-iommufd_fixes1_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Reported-by: Binbin Wu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/pages.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index fccdba782cb69..c771772296485 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -289,6 +289,10 @@ static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) if (!keep_pfns) return batch_clear(batch); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!batch->end || + batch->npfns[batch->end - 1] < keep_pfns); + batch->total_pfns = keep_pfns; batch->npfns[0] = keep_pfns; batch->pfns[0] = batch->pfns[batch->end - 1] + @@ -723,7 +727,7 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, mmap_read_unlock(pages->source_mm); if (pages->source_mm != current->mm) mmput(pages->source_mm); - user->locked = 0; + user->locked = -1; } kfree(user->upages); @@ -810,7 +814,6 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; - npages = pages->npinned - pages->last_npinned; do { cur_pages = atomic_long_read(&pages->source_user->locked_vm); new_pages = cur_pages + npages; -- GitLab From d6c55c0a20e5059abdde81713ddf6324a946eb3c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 7 Dec 2022 16:44:43 -0400 Subject: [PATCH 1284/1423] iommufd: Change the order of MSI setup Eric points out this is wrong for the rare case of someone using allow_unsafe_interrupts on ARM. We always have to setup the MSI window in the domain if the iommu driver asks for it. Move the iommu_get_msi_cookie() setup to the top of the function and always do it, regardless of the security mode. Add checks to iommufd_device_setup_msi() to ensure the driver is not doing something incomprehensible. No current driver will set both a HW and SW MSI window, or have more than one SW MSI window. Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices") Link: https://lore.kernel.org/r/3-v1-0362a1a1c034+98-iommufd_fixes1_jgg@nvidia.com Reviewed-by: Kevin Tian Reported-by: Eric Auger Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 56 +++++++++++++--------------- drivers/iommu/iommufd/io_pagetable.c | 24 +++++++----- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dd2a415b603e3..d81f93a321afc 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -139,19 +139,11 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, int rc; /* - * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it - * creates the MSI window by default in the iommu domain. Nothing - * further to do. - */ - if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP)) - return 0; - - /* - * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every - * allocated iommu_domain will block interrupts by default and this - * special flow is needed to turn them back on. iommu_dma_prepare_msi() - * will install pages into our domain after request_irq() to make this - * work. + * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to + * call iommu_get_msi_cookie() on its behalf. This is necessary to setup + * the MSI window so iommu_dma_prepare_msi() can install pages into our + * domain after request_irq(). If it is not done interrupts will not + * work on this domain. * * FIXME: This is conceptually broken for iommufd since we want to allow * userspace to change the domains, eg switch from an identity IOAS to a @@ -159,33 +151,35 @@ static int iommufd_device_setup_msi(struct iommufd_device *idev, * matches what the IRQ layer actually expects in a newly created * domain. */ - if (irq_domain_check_msi_remap()) { - if (WARN_ON(!sw_msi_start)) - return -EPERM; + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (rc) + return rc; + /* * iommu_get_msi_cookie() can only be called once per domain, * it returns -EBUSY on later calls. */ - if (hwpt->msi_cookie) - return 0; - rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); - if (rc) - return rc; hwpt->msi_cookie = true; - return 0; } /* - * Otherwise the platform has a MSI window that is not isolated. For - * historical compat with VFIO allow a module parameter to ignore the - * insecurity. + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Insecure means that a MemWr + * operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. */ - if (!allow_unsafe_interrupts) - return -EPERM; + if (!device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP) && + !irq_domain_check_msi_remap()) { + if (!allow_unsafe_interrupts) + return -EPERM; - dev_warn( - idev->dev, - "MSI interrupt window cannot be isolated by the IOMMU, this platform is insecure. Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + dev_warn( + idev->dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. " + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } return 0; } @@ -203,7 +197,7 @@ static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, static int iommufd_device_do_attach(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { - phys_addr_t sw_msi_start = 0; + phys_addr_t sw_msi_start = PHYS_ADDR_MAX; int rc; mutex_lock(&hwpt->devices_lock); diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 3467cea795684..e0ae72b9e67f8 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1170,6 +1170,8 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, struct iommu_resv_region *resv; struct iommu_resv_region *tmp; LIST_HEAD(group_resv_regions); + unsigned int num_hw_msi = 0; + unsigned int num_sw_msi = 0; int rc; down_write(&iopt->iova_rwsem); @@ -1181,23 +1183,25 @@ int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) continue; - /* - * The presence of any 'real' MSI regions should take precedence - * over the software-managed one if the IOMMU driver happens to - * advertise both types. - */ - if (sw_msi_start && resv->type == IOMMU_RESV_MSI) { - *sw_msi_start = 0; - sw_msi_start = NULL; - } - if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) + if (sw_msi_start && resv->type == IOMMU_RESV_MSI) + num_hw_msi++; + if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { *sw_msi_start = resv->start; + num_sw_msi++; + } rc = iopt_reserve_iova(iopt, resv->start, resv->length - 1 + resv->start, device); if (rc) goto out_reserved; } + + /* Drivers must offer sane combinations of regions */ + if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { + rc = -EINVAL; + goto out_reserved; + } + rc = 0; goto out_free_resv; -- GitLab From 3282a549cf9b300e2d1b007925ed007ab24e4131 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 9 Dec 2022 13:59:26 +0900 Subject: [PATCH 1285/1423] RDMA/rxe: Fix oops with zero length reads The commit 686d348476ee ("RDMA/rxe: Remove unnecessary mr testing") causes a kernel crash. If responder get a zero-byte RDMA Read request, qp->resp.mr is not set in check_rkey() (see IBA C9-88). The mr is NULL in this case, and a NULL pointer dereference occurs as shown below. BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP PTI CPU: 2 PID: 3622 Comm: python3 Kdump: loaded Not tainted 6.1.0-rc3+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:__rxe_put+0xc/0x60 [rdma_rxe] Code: cc cc cc 31 f6 e8 64 36 1b d3 41 b8 01 00 00 00 44 89 c0 c3 cc cc cc cc 41 89 c0 eb c1 90 0f 1f 44 00 00 41 54 b8 ff ff ff ff 0f c1 47 10 83 f8 01 74 11 45 31 e4 85 c0 7e 20 44 89 e0 41 5c RSP: 0018:ffffb27bc012ce78 EFLAGS: 00010246 RAX: 00000000ffffffff RBX: ffff9790857b0580 RCX: 0000000000000000 RDX: ffff979080fe145a RSI: 000055560e3e0000 RDI: 0000000000000000 RBP: ffff97909c7dd800 R08: 0000000000000001 R09: e7ce43d97f7bed0f R10: ffff97908b29c300 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff97908b29c300 R15: 0000000000000000 FS: 00007f276f7bd740(0000) GS:ffff9792b5c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000114230002 CR4: 0000000000060ee0 Call Trace: read_reply+0xda/0x310 [rdma_rxe] rxe_responder+0x82d/0xe50 [rdma_rxe] do_task+0x84/0x170 [rdma_rxe] tasklet_action_common.constprop.0+0xa7/0x120 __do_softirq+0xcb/0x2ac do_softirq+0x63/0x90 Support a NULL mr during read_reply() Fixes: 686d348476ee ("RDMA/rxe: Remove unnecessary mr testing") Fixes: b5f9a01fae42 ("RDMA/rxe: Fix mr leak in RESPST_ERR_RNR") Link: https://lore.kernel.org/r/20221209045926.531689-1-matsuda-daisuke@fujitsu.com Link: https://lore.kernel.org/r/20221202145713.13152-1-lizhijian@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Li Zhijian Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 12eb85e8d4154..0c8bec2819376 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -460,7 +460,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, return RESPST_EXECUTE; } - /* A zero-byte op is not required to set an addr or rkey. */ + /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { @@ -880,13 +880,15 @@ static enum resp_states read_reply(struct rxe_qp *qp, skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { - rxe_put(mr); + if (mr) + rxe_put(mr); return RESPST_ERR_RNR; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - rxe_put(mr); + if (mr) + rxe_put(mr); if (err) { kfree_skb(skb); return RESPST_ERR_RKEY_VIOLATION; -- GitLab From 6ff8ca3f93d3cd2a77f051d2d971cf3638d39546 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Mon, 28 Nov 2022 10:36:43 +0800 Subject: [PATCH 1286/1423] riscv: mm: call best_map_size many times during linear-mapping Modify the best_map_size function to give map_size many times instead of only once, so a memory region can be mapped by both PMD_SIZE and PAGE_SIZE. Signed-off-by: Qinglin Pan Reviewed-by: Andrew Jones Reviewed-by: Alexandre Ghiti Tested-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20221128023643.329091-1-panqinglin2020@iscas.ac.cn Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 7d59516ce6b39..bb0028c07ef30 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -673,10 +673,11 @@ void __init create_pgd_mapping(pgd_t *pgdp, static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) { /* Upgrade to PMD_SIZE mappings whenever possible */ - if ((base & (PMD_SIZE - 1)) || (size & (PMD_SIZE - 1))) - return PAGE_SIZE; + base &= PMD_SIZE - 1; + if (!base && size >= PMD_SIZE) + return PMD_SIZE; - return PMD_SIZE; + return PAGE_SIZE; } #ifdef CONFIG_XIP_KERNEL @@ -1111,9 +1112,9 @@ static void __init setup_vm_final(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - map_size = best_map_size(start, end - start); for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); + map_size = best_map_size(pa, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, pgprot_from_va(va)); -- GitLab From 689c5421bfe0eac65526bd97a466b9590a6aad3c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 8 Dec 2022 15:09:46 -0600 Subject: [PATCH 1287/1423] RDMA/rxe: Fix incorrect responder length checking The code in rxe_resp.c at check_length() is incorrect as it compares pkt->opcode an 8 bit value against various mask bits which are all higher than 256 so nothing is ever reported. This patch rewrites this to compare against pkt->mask which is correct. However this now exposes another error. For UD send packets the value of the pmtu cannot be determined from qp->mtu. All that is required here is to later check if the payload fits into the posted receive buffer in that case. Fixes: 837a55847ead ("RDMA/rxe: Implement packet length validation on responder") Link: https://lore.kernel.org/r/20221208210945.28607-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Reviewed-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 62 ++++++++++++++++------------ 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 0c8bec2819376..abbaa41017e87 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -392,36 +392,46 @@ static enum resp_states check_resource(struct rxe_qp *qp, return RESPST_CHK_LENGTH; } -static enum resp_states check_length(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { - int mtu = qp->mtu; - u32 payload = payload_size(pkt); - u32 dmalen = reth_len(pkt); - - /* RoCEv2 packets do not have LRH. - * Let's skip checking it. + /* + * See IBA C9-92 + * For UD QPs we only check if the packet will fit in the + * receive buffer later. For rmda operations additional + * length checks are performed in check_rkey. */ - - if ((pkt->opcode & RXE_START_MASK) && - (pkt->opcode & RXE_END_MASK)) { - /* "only" packets */ - if (payload > mtu) - return RESPST_ERR_LENGTH; - } else if ((pkt->opcode & RXE_START_MASK) || - (pkt->opcode & RXE_MIDDLE_MASK)) { - /* "first" or "middle" packets */ - if (payload != mtu) - return RESPST_ERR_LENGTH; - } else if (pkt->opcode & RXE_END_MASK) { - /* "last" packets */ - if ((payload == 0) || (payload > mtu)) - return RESPST_ERR_LENGTH; + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || + (qp_type(qp) == IB_QPT_UC))) { + unsigned int mtu = qp->mtu; + unsigned int payload = payload_size(pkt); + + if ((pkt->mask & RXE_START_MASK) && + (pkt->mask & RXE_END_MASK)) { + if (unlikely(payload > mtu)) { + rxe_dbg_qp(qp, "only packet too long"); + return RESPST_ERR_LENGTH; + } + } else if ((pkt->mask & RXE_START_MASK) || + (pkt->mask & RXE_MIDDLE_MASK)) { + if (unlikely(payload != mtu)) { + rxe_dbg_qp(qp, "first or middle packet not mtu"); + return RESPST_ERR_LENGTH; + } + } else if (pkt->mask & RXE_END_MASK) { + if (unlikely((payload == 0) || (payload > mtu))) { + rxe_dbg_qp(qp, "last packet zero or too long"); + return RESPST_ERR_LENGTH; + } + } } - if (pkt->opcode & (RXE_WRITE_MASK | RXE_READ_MASK)) { - if (dmalen > (1 << 31)) + /* See IBA C9-94 */ + if (pkt->mask & RXE_RETH_MASK) { + if (reth_len(pkt) > (1U << 31)) { + rxe_dbg_qp(qp, "dma length too long"); return RESPST_ERR_LENGTH; + } } return RESPST_CHK_RKEY; @@ -1401,7 +1411,7 @@ int rxe_responder(void *arg) state = check_resource(qp, pkt); break; case RESPST_CHK_LENGTH: - state = check_length(qp, pkt); + state = rxe_resp_check_length(qp, pkt); break; case RESPST_CHK_RKEY: state = check_rkey(qp, pkt); -- GitLab From 0c17da492dc6c33cc5b99633adb4bd7b2587153c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:52 +0800 Subject: [PATCH 1288/1423] RDMA: Extend RDMA user ABI to support flush This commit extends the RDMA user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA user ABI. Link: https://lore.kernel.org/r/20221206130201.30986-2-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_ioctl_verbs.h | 2 ++ include/uapi/rdma/ib_user_verbs.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index e0c25537fd2e9..d7c5aaa327445 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -57,6 +57,8 @@ enum ib_uverbs_access_flags { IB_UVERBS_ACCESS_ZERO_BASED = 1 << 5, IB_UVERBS_ACCESS_ON_DEMAND = 1 << 6, IB_UVERBS_ACCESS_HUGETLB = 1 << 7, + IB_UVERBS_ACCESS_FLUSH_GLOBAL = 1 << 8, + IB_UVERBS_ACCESS_FLUSH_PERSISTENT = 1 << 9, IB_UVERBS_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_OPTIONAL_FIRST, IB_UVERBS_ACCESS_OPTIONAL_RANGE = diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 2378148155442..e16650f0c85dd 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -105,6 +105,18 @@ enum { IB_USER_VERBS_EX_CMD_MODIFY_CQ }; +/* see IBA A19.4.1.1 Placement Types */ +enum ib_placement_type { + IB_FLUSH_GLOBAL = 1U << 0, + IB_FLUSH_PERSISTENT = 1U << 1, +}; + +/* see IBA A19.4.1.2 Selectivity Level */ +enum ib_selectivity_level { + IB_FLUSH_RANGE = 0, + IB_FLUSH_MR, +}; + /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -466,6 +478,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_FLUSH = 8, IB_UVERBS_WC_ATOMIC_WRITE = 9, }; @@ -785,6 +798,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_FLUSH = 14, IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1333,6 +1347,9 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. */ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Flush placement types */ + IB_UVERBS_DEVICE_FLUSH_GLOBAL = 1ULL << 38, + IB_UVERBS_DEVICE_FLUSH_PERSISTENT = 1ULL << 39, /* Atomic write attributes */ IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; -- GitLab From 208e3a134b50d95ea3962d7a37b4d8a8f5368376 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:53 +0800 Subject: [PATCH 1289/1423] RDMA: Extend RDMA kernel verbs ABI to support flush This commit extends the RDMA kernel verbs ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA kernel verbs ABI. It makes device/HCA support new FLUSH attributes/capabilities, and it also makes memory region support new FLUSH access flags. Users can use ibv_reg_mr(3) to register flush access flags. Only the access flags also supported by device's capabilities can be registered successfully. Once registered successfully, it means the MR is flushable. Similarly, A flushable MR should also have one or both of GLOBAL_VISIBILITY and PERSISTENT attributes/capabilities like device/HCA. Link: https://lore.kernel.org/r/20221206130201.30986-3-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 3 +++ include/rdma/ib_verbs.h | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index f932d164af637..b8c56d7dc35da 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_FLUSH = 0x1C, IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() @@ -113,6 +114,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, FLUSH), IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ @@ -151,6 +153,7 @@ enum { IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RD, COMPARE_SWAP), IB_OPCODE(RD, FETCH_ADD), + IB_OPCODE(RD, FLUSH), /* UD */ IB_OPCODE(UD, SEND_ONLY), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df6bb26ba0be6..a9a429172c0a1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,9 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + /* Placement type attributes */ + IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, + IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; @@ -987,6 +990,7 @@ enum ib_wc_opcode { IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, + IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). @@ -1327,6 +1331,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ @@ -1461,10 +1466,12 @@ enum ib_access_flags { IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, + IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, + IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = - ((IB_ACCESS_HUGETLB << 1) - 1) | IB_ACCESS_OPTIONAL, + ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* @@ -4325,6 +4332,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { + u64 device_cap = ib_dev->attrs.device_cap_flags; + /* * Local write permission is required if remote write or * remote atomic permission is also requested. @@ -4339,6 +4348,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; + + if ((flags & IB_ACCESS_FLUSH_GLOBAL && + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || + (flags & IB_ACCESS_FLUSH_PERSISTENT && + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) + return -EOPNOTSUPP; + return 0; } -- GitLab From 668ce52d5eef477c0def757610768a1a3ccc9785 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:54 +0800 Subject: [PATCH 1290/1423] RDMA/rxe: Extend rxe user ABI to support flush This commit extends the rxe user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing rxe user ABI. The user API request a flush by filling this structure. Link: https://lore.kernel.org/r/20221206130201.30986-4-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index d20d1ecf046fd..bb092fccb813c 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -82,6 +82,13 @@ struct rxe_send_wr { __u32 invalidate_rkey; } ex; union { + struct { + __aligned_u64 remote_addr; + __u32 length; + __u32 rkey; + __u8 type; + __u8 level; + } flush; struct { __aligned_u64 remote_addr; __u32 rkey; -- GitLab From 02ea0a511558c907bde0e01fdebcd4536924d996 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:55 +0800 Subject: [PATCH 1291/1423] RDMA/rxe: Allow registering persistent flag for pmem MR only Memory region could support at most 2 flush access flags: IB_ACCESS_FLUSH_PERSISTENT and IB_ACCESS_FLUSH_GLOBAL But we only allow user to register persistent flush flags to the pmem MR where it has the ability of persisting data across power cycles. So registering a persistent access flag to a non-pmem MR will be rejected. Link: https://lore.kernel.org/r/20221206130201.30986-5-lizhijian@fujitsu.com CC: Dan Williams Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b7c9ff1ddf0e1..81a438e5010aa 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -111,6 +111,15 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->ibmr.type = IB_MR_TYPE_DMA; } +static bool is_pmem_page(struct page *pg) +{ + unsigned long paddr = page_to_phys(pg); + + return REGION_INTERSECTS == + region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); +} + int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr) { @@ -146,16 +155,25 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, num_buf = 0; map = mr->map; if (length > 0) { - buf = map[0]->buf; + bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT; + buf = map[0]->buf; for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) { + struct page *pg = sg_page_iter_page(&sg_iter); + + if (persistent_access && !is_pmem_page(pg)) { + rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n"); + err = -EINVAL; + goto err_release_umem; + } + if (num_buf >= RXE_BUF_PER_MAP) { map++; buf = map[0]->buf; num_buf = 0; } - vaddr = page_address(sg_page_iter_page(&sg_iter)); + vaddr = page_address(pg); if (!vaddr) { rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; -- GitLab From 02e9a31c897d17981508ceaac4430b93ff56ffc7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:56 +0800 Subject: [PATCH 1292/1423] RDMA/rxe: Extend rxe packet format to support flush Extend rxe opcode tables, headers, helper and constants to support flush operations. Refer to the IBA A19.4.1 for more FETH definition details Link: https://lore.kernel.org/r/20221206130201.30986-6-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_hdr.h | 47 ++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.c | 17 ++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.h | 14 +++++--- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 804594b76040a..46f82b27fcd2f 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -607,6 +607,52 @@ static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) rxe_opcode[pkt->opcode].offset[RXE_RETH], len); } +/****************************************************************************** + * FLUSH Extended Transport Header + ******************************************************************************/ + +struct rxe_feth { + __be32 bits; +}; + +#define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */ +#define FETH_SEL_MASK (0x00000030) /* bits 5-4 */ +#define FETH_SEL_SHIFT (4U) + +static inline u32 __feth_plt(void *arg) +{ + struct rxe_feth *feth = arg; + + return be32_to_cpu(feth->bits) & FETH_PLT_MASK; +} + +static inline u32 __feth_sel(void *arg) +{ + struct rxe_feth *feth = arg; + + return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT; +} + +static inline u32 feth_plt(struct rxe_pkt_info *pkt) +{ + return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline u32 feth_sel(struct rxe_pkt_info *pkt) +{ + return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level) +{ + struct rxe_feth *feth = (struct rxe_feth *) + (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); + u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) | + (type & FETH_PLT_MASK); + + feth->bits = cpu_to_be32(bits); +} + /****************************************************************************** * Atomic Extended Transport Header ******************************************************************************/ @@ -909,6 +955,7 @@ enum rxe_hdr_length { RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), RXE_IETH_BYTES = sizeof(struct rxe_ieth), RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), + RXE_FETH_BYTES = sizeof(struct rxe_feth), }; static inline size_t header_size(struct rxe_pkt_info *pkt) diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c index fb196029048e5..5c0d5c6ffda4f 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.c +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -101,6 +101,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_QPT_UC] = WR_LOCAL_OP_MASK, }, }, + [IB_WR_FLUSH] = { + .name = "IB_WR_FLUSH", + .mask = { + [IB_QPT_RC] = WR_FLUSH_MASK, + }, + }, [IB_WR_ATOMIC_WRITE] = { .name = "IB_WR_ATOMIC_WRITE", .mask = { @@ -384,6 +390,17 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { RXE_IETH_BYTES, } }, + [IB_OPCODE_RC_FLUSH] = { + .name = "IB_OPCODE_RC_FLUSH", + .mask = RXE_FETH_MASK | RXE_RETH_MASK | RXE_FLUSH_MASK | + RXE_START_MASK | RXE_END_MASK | RXE_REQ_MASK, + .length = RXE_BTH_BYTES + RXE_FETH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_FETH] = RXE_BTH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + RXE_FETH_BYTES, + } + }, [IB_OPCODE_RC_ATOMIC_WRITE] = { .name = "IB_OPCODE_RC_ATOMIC_WRITE", .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index a470e9b0b8842..cea4e0a639199 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -20,6 +20,7 @@ enum rxe_wr_mask { WR_READ_MASK = BIT(3), WR_WRITE_MASK = BIT(4), WR_LOCAL_OP_MASK = BIT(5), + WR_FLUSH_MASK = BIT(6), WR_ATOMIC_WRITE_MASK = BIT(7), WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, @@ -48,6 +49,7 @@ enum rxe_hdr_type { RXE_RDETH, RXE_DETH, RXE_IMMDT, + RXE_FETH, RXE_PAYLOAD, NUM_HDR_TYPES }; @@ -64,6 +66,7 @@ enum rxe_hdr_mask { RXE_IETH_MASK = BIT(RXE_IETH), RXE_RDETH_MASK = BIT(RXE_RDETH), RXE_DETH_MASK = BIT(RXE_DETH), + RXE_FETH_MASK = BIT(RXE_FETH), RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), @@ -72,13 +75,14 @@ enum rxe_hdr_mask { RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + RXE_FLUSH_MASK = BIT(NUM_HDR_TYPES + 6), - RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6), - RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 7), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 8), - RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), - RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), - RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + RXE_START_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 10), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 11), RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), -- GitLab From fa1fd682ad3ef35b0e532c3bb14140786d17527c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:57 +0800 Subject: [PATCH 1293/1423] RDMA/rxe: Implement RC RDMA FLUSH service in requester side Implement FLUSH request operation in the requester. Link: https://lore.kernel.org/r/20221206130201.30986-7-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 2713e90589225..899c8779f8001 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -241,6 +241,9 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : IB_OPCODE_RC_SEND_FIRST; + case IB_WR_FLUSH: + return IB_OPCODE_RC_FLUSH; + case IB_WR_RDMA_READ: return IB_OPCODE_RC_RDMA_READ_REQUEST; @@ -425,11 +428,18 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, /* init optional headers */ if (pkt->mask & RXE_RETH_MASK) { - reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + if (pkt->mask & RXE_FETH_MASK) + reth_set_rkey(pkt, ibwr->wr.flush.rkey); + else + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); reth_set_va(pkt, wqe->iova); reth_set_len(pkt, wqe->dma.resid); } + /* Fill Flush Extension Transport Header */ + if (pkt->mask & RXE_FETH_MASK) + feth_init(pkt, ibwr->wr.flush.type, ibwr->wr.flush.level); + if (pkt->mask & RXE_IMMDT_MASK) immdt_set_imm(pkt, ibwr->ex.imm_data); @@ -488,6 +498,9 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, memset(pad, 0, bth_pad(pkt)); } + } else if (pkt->mask & RXE_FLUSH_MASK) { + /* oA19-2: shall have no payload. */ + wqe->dma.resid = 0; } if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { -- GitLab From ea1bb00ee9a5527b032a6efebe4a879db4cb42bb Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:58 +0800 Subject: [PATCH 1294/1423] RDMA/rxe: Implement flush execution in responder side Only the requested placement types that also registered in the destination memory region are acceptable. Otherwise, responder will also reply NAK "Remote Access Error" if it found a placement type violation. We will persist data via arch_wb_cache_pmem(), which could be architecture specific. This commit also adds 2 helpers to update qp.resp from the incoming packet. Link: https://lore.kernel.org/r/20221206130201.30986-8-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 1 + drivers/infiniband/sw/rxe/rxe_mr.c | 36 ++++++ drivers/infiniband/sw/rxe/rxe_resp.c | 160 ++++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 + 4 files changed, 183 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index a22476d27b384..948ce4902b10f 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -64,6 +64,7 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr); int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr); +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length); int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir); int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 81a438e5010aa..072eac4b65d29 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -4,6 +4,8 @@ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ +#include + #include "rxe.h" #include "rxe_loc.h" @@ -192,6 +194,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, mr->offset = ib_umem_offset(umem); mr->state = RXE_MR_STATE_VALID; mr->ibmr.type = IB_MR_TYPE_USER; + mr->ibmr.page_size = PAGE_SIZE; return 0; @@ -295,6 +298,39 @@ out: return addr; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length) +{ + size_t offset; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask; + while (length > 0) { + u8 *va; + int bytes; + + bytes = mr->ibmr.page_size - offset; + if (bytes > length) + bytes = length; + + va = iova_to_vaddr(mr, iova, length); + if (!va) + return -EFAULT; + + arch_wb_cache_pmem(va, bytes); + + length -= bytes; + iova += bytes; + offset = 0; + } + + return 0; +} + /* copy data from a range (vaddr, vaddr+length-1) to or from * a mr object starting at iova. */ diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index abbaa41017e87..7a60c7709da04 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -23,6 +23,7 @@ enum resp_states { RESPST_READ_REPLY, RESPST_ATOMIC_REPLY, RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -59,6 +60,7 @@ static char *resp_state_name[] = { [RESPST_READ_REPLY] = "READ_REPLY", [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", + [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -258,19 +260,37 @@ static enum resp_states check_op_seq(struct rxe_qp *qp, } } +static bool check_qp_attr_access(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + return false; + + if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if ((flush_type & IB_FLUSH_GLOBAL && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_GLOBAL)) || + (flush_type & IB_FLUSH_PERSISTENT && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_PERSISTENT))) + return false; + } + + return true; +} + static enum resp_states check_op_valid(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { switch (qp_type(qp)) { case IB_QPT_RC: - if (((pkt->mask & RXE_READ_MASK) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || - ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || - ((pkt->mask & RXE_ATOMIC_MASK) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + if (!check_qp_attr_access(qp, pkt)) return RESPST_ERR_UNSUPPORTED_OPCODE; - } break; @@ -437,6 +457,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, return RESPST_CHK_RKEY; } +static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->resp.va = reth_va(pkt); + qp->resp.offset = 0; + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); +} + +static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->resp.va = atmeth_va(pkt); + qp->resp.offset = 0; + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); +} + static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -448,23 +485,26 @@ static enum resp_states check_rkey(struct rxe_qp *qp, u32 pktlen; int mtu = qp->mtu; enum resp_states state; - int access; + int access = 0; if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { - if (pkt->mask & RXE_RETH_MASK) { - qp->resp.va = reth_va(pkt); - qp->resp.offset = 0; - qp->resp.rkey = reth_rkey(pkt); - qp->resp.resid = reth_len(pkt); - qp->resp.length = reth_len(pkt); - } + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + + if (flush_type & IB_FLUSH_GLOBAL) + access |= IB_ACCESS_FLUSH_GLOBAL; + if (flush_type & IB_FLUSH_PERSISTENT) + access |= IB_ACCESS_FLUSH_PERSISTENT; } else if (pkt->mask & RXE_ATOMIC_MASK) { - qp->resp.va = atmeth_va(pkt); - qp->resp.offset = 0; - qp->resp.rkey = atmeth_rkey(pkt); - qp->resp.resid = sizeof(u64); + qp_resp_from_atmeth(qp, pkt); access = IB_ACCESS_REMOTE_ATOMIC; } else { return RESPST_EXECUTE; @@ -511,11 +551,20 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } } + if (pkt->mask & RXE_FLUSH_MASK) { + /* FLUSH MR may not set va or resid + * no need to check range since we will flush whole mr + */ + if (feth_sel(pkt) == IB_FLUSH_MR) + goto skip_check_range; + } + if (mr_check_range(mr, va + qp->resp.offset, resid)) { state = RESPST_ERR_RKEY_VIOLATION; goto err; } +skip_check_range: if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { @@ -621,11 +670,61 @@ static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, res->last_psn = pkt->psn; res->cur_psn = pkt->psn; break; + case RXE_FLUSH_MASK: + res->flush.va = qp->resp.va + qp->resp.offset; + res->flush.length = qp->resp.length; + res->flush.type = feth_plt(pkt); + res->flush.level = feth_sel(pkt); } return res; } +static enum resp_states process_flush(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 length, start; + struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + + /* oA19-14, oA19-15 */ + if (res && res->replay) + return RESPST_ACKNOWLEDGE; + else if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK); + qp->resp.res = res; + } + + if (res->flush.level == IB_FLUSH_RANGE) { + start = res->flush.va; + length = res->flush.length; + } else { /* level == IB_FLUSH_MR */ + start = mr->ibmr.iova; + length = mr->ibmr.length; + } + + if (res->flush.type & IB_FLUSH_PERSISTENT) { + if (rxe_flush_pmem_iova(mr, start, length)) + return RESPST_ERR_RKEY_VIOLATION; + /* Make data persistent. */ + wmb(); + } else if (res->flush.type & IB_FLUSH_GLOBAL) { + /* Make data global visibility. */ + wmb(); + } + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +} + /* Guarantee atomicity of atomic operations at the machine level. */ static DEFINE_SPINLOCK(atomic_ops_lock); @@ -980,6 +1079,8 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) return RESPST_ATOMIC_REPLY; } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { return RESPST_ATOMIC_WRITE_REPLY; + } else if (pkt->mask & RXE_FLUSH_MASK) { + return RESPST_PROCESS_FLUSH; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -1176,7 +1277,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp, send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); - else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) + else if (pkt->mask & (RXE_FLUSH_MASK | RXE_ATOMIC_WRITE_MASK)) send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); @@ -1234,6 +1335,22 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, /* SEND. Ack again and cleanup. C9-105. */ send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); return RESPST_CLEANUP; + } else if (pkt->mask & RXE_FLUSH_MASK) { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = RESPST_PROCESS_FLUSH; + goto out; + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; } else if (pkt->mask & RXE_READ_MASK) { struct resp_res *res; @@ -1431,6 +1548,9 @@ int rxe_responder(void *arg) case RESPST_ATOMIC_WRITE_REPLY: state = atomic_write_reply(qp, pkt); break; + case RESPST_PROCESS_FLUSH: + state = process_flush(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 22a299b0a9f0a..19ddfa8904803 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -165,6 +165,12 @@ struct resp_res { u64 va; u32 resid; } read; + struct { + u32 length; + u64 va; + u8 type; + u8 level; + } flush; }; }; -- GitLab From 70aad902ce8aeb094dd3ef14988a652f24cce7c8 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:59 +0800 Subject: [PATCH 1295/1423] RDMA/rxe: Implement flush completion Per IBA SPEC, FLUSH will ack in rdma read response with 0 length. Use IB_WC_FLUSH (aka IB_UVERBS_WC_FLUSH) code to tell userspace a FLUSH completion. Link: https://lore.kernel.org/r/20221206130201.30986-9-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 1c525325e2716..20737fec392bf 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -105,6 +105,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) case IB_WR_REG_MR: return IB_WC_REG_MR; case IB_WR_BIND_MW: return IB_WC_BIND_MW; case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE; + case IB_WR_FLUSH: return IB_WC_FLUSH; default: return 0xff; @@ -278,7 +279,8 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, */ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (wqe->wr.opcode != IB_WR_RDMA_READ && - wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV && + wqe->wr.opcode != IB_WR_FLUSH) { wqe->status = IB_WC_FATAL_ERR; return COMPST_ERROR; } -- GitLab From 8b4d379b399d19f4c803e565bfe13f07b66b5ad7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:02:00 +0800 Subject: [PATCH 1296/1423] RDMA/cm: Make QP FLUSHABLE for supported device Similar to RDMA and Atomic qp attributes enabled by default in CM, enable FLUSH attribute for supported device. That makes applications that are built with rdma_create_ep, rdma_accept APIs have FLUSH qp attribute natively so that user is able to request FLUSH operation simpler. Note that, a FLUSH operation requires FLUSH are supported by both device(HCA) and memory region(MR) and QP at the same time, so it's safe to enable FLUSH qp attribute by default here. FLUSH attribute can be disable by modify_qp() interface. Link: https://lore.kernel.org/r/20221206130201.30986-10-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 1f9938a2c4752..603c0aecc3614 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -4094,9 +4094,18 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - if (cm_id_priv->responder_resources) + if (cm_id_priv->responder_resources) { + struct ib_device *ib_dev = cm_id_priv->id.device; + u64 support_flush = ib_dev->attrs.device_cap_flags & + (IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT); + u32 flushable = support_flush ? + (IB_ACCESS_FLUSH_GLOBAL | + IB_ACCESS_FLUSH_PERSISTENT) : 0; + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_ATOMIC; + IB_ACCESS_REMOTE_ATOMIC | + flushable; + } qp_attr->pkey_index = cm_id_priv->av.pkey_index; if (cm_id_priv->av.port) qp_attr->port_num = cm_id_priv->av.port->port_num; -- GitLab From 124011e6e933bead5852c3f69b32dec43919fe1a Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:02:01 +0800 Subject: [PATCH 1297/1423] RDMA/rxe: Enable RDMA FLUSH capability for rxe device Now we are ready to enable RDMA FLUSH capability for RXE. It can support Global Visibility and Persistence placement types. Link: https://lore.kernel.org/r/20221206130201.30986-11-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_param.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index bbc88cd71d950..a754fc902e3d1 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -51,6 +51,8 @@ enum rxe_device_param { | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_MEM_WINDOW + | IB_DEVICE_FLUSH_GLOBAL + | IB_DEVICE_FLUSH_PERSISTENT #ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -- GitLab From 2ba8c7dc71c098935977528747b82ffae43f3f18 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 16:00:50 +0100 Subject: [PATCH 1298/1423] riscv: Don't duplicate __ALTERNATIVE_CFG in __ALTERNATIVE_CFG_2 Build __ALTERNATIVE_CFG_2 by adding on to __ALTERNATIVE_CFG rather than duplicating it. Signed-off-by: Andrew Jones Tested-by: Lad Prabhakar Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221129150053.50464-2-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index ec2f3f1b836f3..c7036166af2c6 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -49,14 +49,7 @@ .macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ new_c_2, vendor_id_2, errata_id_2, enable_2 -886 : - .option push - .option norvc - .option norelax - \old_c - .option pop -887 : - ALT_NEW_CONTENT \vendor_id_1, \errata_id_1, \enable_1, \new_c_1 + __ALTERNATIVE_CFG \old_c, \new_c_1, \vendor_id_1, \errata_id_1, \enable_1 ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 .endm @@ -116,14 +109,7 @@ enable_1, \ new_c_2, vendor_id_2, errata_id_2, \ enable_2) \ - "886 :\n" \ - ".option push\n" \ - ".option norvc\n" \ - ".option norelax\n" \ - old_c "\n" \ - ".option pop\n" \ - "887 :\n" \ - ALT_NEW_CONTENT(vendor_id_1, errata_id_1, enable_1, new_c_1) \ + __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1) \ ALT_NEW_CONTENT(vendor_id_2, errata_id_2, enable_2, new_c_2) #define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ -- GitLab From 7d52eace1bf5c55704bb0ca5dc8f2489927683ff Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 16:00:51 +0100 Subject: [PATCH 1299/1423] riscv: alternatives: Don't name unused macro parameters Without CONFIG_RISCV_ALTERNATIVE only the first parameter of the ALTERNATIVE macros is needed. Use ... for the rest to cut down on clutter. While there, fix a couple space vs. tab issues. Signed-off-by: Andrew Jones Tested-by: Lad Prabhakar Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221129150053.50464-3-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index c7036166af2c6..7cc2b587c5c4e 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -130,28 +130,22 @@ \old_c .endm -#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ +#define _ALTERNATIVE_CFG(old_c, ...) \ __ALTERNATIVE_CFG old_c -#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - CONFIG_k_1, \ - new_c_2, vendor_id_2, errata_id_2, \ - CONFIG_k_2) \ - __ALTERNATIVE_CFG old_c +#define _ALTERNATIVE_CFG_2(old_c, ...) \ + __ALTERNATIVE_CFG old_c #else /* !__ASSEMBLY__ */ -#define __ALTERNATIVE_CFG(old_c) \ +#define __ALTERNATIVE_CFG(old_c) \ old_c "\n" -#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ +#define _ALTERNATIVE_CFG(old_c, ...) \ __ALTERNATIVE_CFG(old_c) -#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - CONFIG_k_1, \ - new_c_2, vendor_id_2, errata_id_2, \ - CONFIG_k_2) \ - __ALTERNATIVE_CFG(old_c) +#define _ALTERNATIVE_CFG_2(old_c, ...) \ + __ALTERNATIVE_CFG(old_c) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_RISCV_ALTERNATIVE */ -- GitLab From bb2efcde594628ae08ee6e4be51b2047df9d2d06 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 16:00:52 +0100 Subject: [PATCH 1300/1423] riscv: alternatives: Drop the underscores from the assembly macro names The underscores aren't needed because there isn't anything already named without them and the _CFG extension. This is a bit of a cleanup by itself, but the real motivation is for a coming patch which would otherwise need to add two more underscores to these macro names, i.e. ____ALTERNATIVE_CFG, and that'd be gross. Signed-off-by: Andrew Jones Tested-by: Lad Prabhakar Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221129150053.50464-4-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 7cc2b587c5c4e..9ea95331a2801 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -33,7 +33,7 @@ .endif .endm -.macro __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable +.macro ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable 886 : .option push .option norvc @@ -45,11 +45,11 @@ .endm #define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ - __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k) + ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k) -.macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ - new_c_2, vendor_id_2, errata_id_2, enable_2 - __ALTERNATIVE_CFG \old_c, \new_c_1, \vendor_id_1, \errata_id_1, \enable_1 +.macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ + new_c_2, vendor_id_2, errata_id_2, enable_2 + ALTERNATIVE_CFG \old_c, \new_c_1, \vendor_id_1, \errata_id_1, \enable_1 ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 .endm @@ -57,9 +57,9 @@ CONFIG_k_1, \ new_c_2, vendor_id_2, errata_id_2, \ CONFIG_k_2) \ - __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, \ + ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, \ IS_ENABLED(CONFIG_k_1), \ - new_c_2, vendor_id_2, errata_id_2, \ + new_c_2, vendor_id_2, errata_id_2, \ IS_ENABLED(CONFIG_k_2) #else /* !__ASSEMBLY__ */ @@ -126,15 +126,15 @@ #else /* CONFIG_RISCV_ALTERNATIVE */ #ifdef __ASSEMBLY__ -.macro __ALTERNATIVE_CFG old_c +.macro ALTERNATIVE_CFG old_c \old_c .endm #define _ALTERNATIVE_CFG(old_c, ...) \ - __ALTERNATIVE_CFG old_c + ALTERNATIVE_CFG old_c #define _ALTERNATIVE_CFG_2(old_c, ...) \ - __ALTERNATIVE_CFG old_c + ALTERNATIVE_CFG old_c #else /* !__ASSEMBLY__ */ -- GitLab From 26fb4b90b745a808e94a81dc732d440c285fa74b Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 16:00:53 +0100 Subject: [PATCH 1301/1423] riscv: Don't duplicate _ALTERNATIVE_CFG* macros Reduce clutter by only defining the _ALTERNATIVE_CFG* macros once, rather than once for assembly and once for C. To do that, we need to add __ALTERNATIVE_CFG* macros to the assembly side, but those are one-liners. Also take the opportunity to do a bit of reformatting, taking full advantage of the fact checkpatch gives us 100 char lines. Signed-off-by: Andrew Jones Tested-by: Lad Prabhakar Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20221129150053.50464-5-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 53 +++++++-------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 9ea95331a2801..7226e2462584b 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -44,23 +44,14 @@ ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c .endm -#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ - ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k) - .macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ new_c_2, vendor_id_2, errata_id_2, enable_2 ALTERNATIVE_CFG \old_c, \new_c_1, \vendor_id_1, \errata_id_1, \enable_1 ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 .endm -#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - CONFIG_k_1, \ - new_c_2, vendor_id_2, errata_id_2, \ - CONFIG_k_2) \ - ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, \ - IS_ENABLED(CONFIG_k_1), \ - new_c_2, vendor_id_2, errata_id_2, \ - IS_ENABLED(CONFIG_k_2) +#define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__ +#define __ALTERNATIVE_CFG_2(...) ALTERNATIVE_CFG_2 __VA_ARGS__ #else /* !__ASSEMBLY__ */ @@ -102,27 +93,21 @@ "887 :\n" \ ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) -#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ - __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) - -#define __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - enable_1, \ - new_c_2, vendor_id_2, errata_id_2, \ - enable_2) \ - __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1) \ +#define __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ + new_c_2, vendor_id_2, errata_id_2, enable_2) \ + __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1) \ ALT_NEW_CONTENT(vendor_id_2, errata_id_2, enable_2, new_c_2) -#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - CONFIG_k_1, \ - new_c_2, vendor_id_2, errata_id_2, \ - CONFIG_k_2) \ - __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, \ - IS_ENABLED(CONFIG_k_1), \ - new_c_2, vendor_id_2, errata_id_2, \ - IS_ENABLED(CONFIG_k_2)) - #endif /* __ASSEMBLY__ */ +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) + +#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \ + new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2)) + #else /* CONFIG_RISCV_ALTERNATIVE */ #ifdef __ASSEMBLY__ @@ -173,13 +158,9 @@ * on the following sample code and then replace ALTERNATIVE() with * ALTERNATIVE_2() to append its customized content. */ -#define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, \ - errata_id_1, CONFIG_k_1, \ - new_content_2, vendor_id_2, \ - errata_id_2, CONFIG_k_2) \ - _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, \ - errata_id_1, CONFIG_k_1, \ - new_content_2, vendor_id_2, \ - errata_id_2, CONFIG_k_2) +#define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) #endif -- GitLab From 726855549cf8d5c6b05795cf74a9c23584f45544 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 15:34:45 +0100 Subject: [PATCH 1302/1423] RISC-V: Improve use of isa2hwcap[] Improve isa2hwcap[] by removing it from static storage, as riscv_fill_hwcap() is only called once, and by reducing its size from 256 bytes to 26. The latter improvement is possible because isa2hwcap[] will never be indexed with capital letters and we can precompute the offsets from 'a'. No functional change intended. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20221129143447.49714-2-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 694267d1fe814..4677320d7e318 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -74,15 +74,15 @@ void __init riscv_fill_hwcap(void) const char *isa; char print_str[NUM_ALPHA_EXTS + 1]; int i, j, rc; - static unsigned long isa2hwcap[256] = {0}; + unsigned long isa2hwcap[26] = {0}; unsigned long hartid; - isa2hwcap['i'] = isa2hwcap['I'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m'] = isa2hwcap['M'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a'] = isa2hwcap['A'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f'] = isa2hwcap['F'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d'] = isa2hwcap['D'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c'] = isa2hwcap['C'] = COMPAT_HWCAP_ISA_C; + isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; + isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; + isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; + isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; + isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; + isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; elf_hwcap = 0; @@ -196,8 +196,10 @@ void __init riscv_fill_hwcap(void) if (unlikely(ext_err)) continue; if (!ext_long) { - this_hwcap |= isa2hwcap[(unsigned char)(*ext)]; - set_bit(*ext - 'a', this_isa); + int nr = *ext - 'a'; + + this_hwcap |= isa2hwcap[nr]; + set_bit(nr, this_isa); } else { SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); -- GitLab From fb0ff0a95d61f69415cb8d8f2d921e1f7eed75af Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 15:34:46 +0100 Subject: [PATCH 1303/1423] RISC-V: Introduce riscv_isa_extension_check Currently any isa extension found in the isa string is set in the isa bitmap. An isa extension set in the bitmap indicates that the extension is present and may be used (a.k.a is enabled). However, when an extension cannot be used due to missing dependencies or errata it should not be added to the bitmap. Introduce a function where additional checks may be placed in order to determine if an extension should be enabled or not. Note, the checks may simply indicate an issue with the DT, but, since extensions may be used in early boot, it's not always possible to simply produce an error at the point the issue is determined. It's best to keep the extension disabled and produce an error. No functional change intended, as the function is only introduced and always returns true. A later patch will provide checks for an isa extension. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20221129143447.49714-3-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 4677320d7e318..220be7222129b 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -68,6 +68,11 @@ bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit) } EXPORT_SYMBOL_GPL(__riscv_isa_extension_available); +static bool riscv_isa_extension_check(int id) +{ + return true; +} + void __init riscv_fill_hwcap(void) { struct device_node *node; @@ -189,7 +194,8 @@ void __init riscv_fill_hwcap(void) #define SET_ISA_EXT_MAP(name, bit) \ do { \ if ((ext_end - ext == sizeof(name) - 1) && \ - !memcmp(ext, name, sizeof(name) - 1)) \ + !memcmp(ext, name, sizeof(name) - 1) && \ + riscv_isa_extension_check(bit)) \ set_bit(bit, this_isa); \ } while (false) \ @@ -198,8 +204,10 @@ void __init riscv_fill_hwcap(void) if (!ext_long) { int nr = *ext - 'a'; - this_hwcap |= isa2hwcap[nr]; - set_bit(nr, this_isa); + if (riscv_isa_extension_check(nr)) { + this_hwcap |= isa2hwcap[nr]; + set_bit(nr, this_isa); + } } else { SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); -- GitLab From 9daaca4a44d6f0741060e67c54a0175c035edb1f Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 29 Nov 2022 15:34:47 +0100 Subject: [PATCH 1304/1423] RISC-V: Ensure Zicbom has a valid block size When a DT puts zicbom in the isa string, but does not provide a block size, ALT_CMO_OP() will attempt to do cache operations on address zero since the start address will be ANDed with zero. We can't simply BUG() in riscv_init_cbom_blocksize() when we fail to find a block size because the failure will happen before logging works, leaving users to scratch their heads as to why the boot hung. Instead, ensure Zicbom is disabled and output an error which will hopefully alert people that the DT needs to be fixed. While at it, add a check that the block size is a power-of-2 too. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20221129143447.49714-4-ajones@ventanamicro.com [Palmer: base on 5c20a3a9df19 ("RISC-V: Fix compilation without RISCV_ISA_ZICBOM"] Reported-by: kernel test robot Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 220be7222129b..93e45560af307 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,18 @@ EXPORT_SYMBOL_GPL(__riscv_isa_extension_available); static bool riscv_isa_extension_check(int id) { + switch (id) { + case RISCV_ISA_EXT_ZICBOM: + if (!riscv_cbom_block_size) { + pr_err("Zicbom detected in ISA string, but no cbom-block-size found\n"); + return false; + } else if (!is_power_of_2(riscv_cbom_block_size)) { + pr_err("cbom-block-size present, but is not a power-of-2\n"); + return false; + } + return true; + } + return true; } -- GitLab From d8d2b65a940bb497749d66bdab59b530901d3854 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Dec 2022 11:01:00 -0600 Subject: [PATCH 1305/1423] PCI/portdrv: Allow AER service only for Root Ports & RCECs Previously portdrv allowed the AER service for any device with an AER capability (assuming Linux had control of AER) even though the AER service driver only attaches to Root Port and RCECs. Because get_port_device_capability() included AER for non-RP, non-RCEC devices, we tried to initialize the AER IRQ even though these devices don't generate AER interrupts. Intel DG1 and DG2 discrete graphics cards contain a switch leading to a GPU. The switch supports AER but not MSI, so initializing an AER IRQ failed, and portdrv failed to claim the switch port at all. The GPU itself could be suspended, but the switch could not be put in a low-power state because it had no driver. Don't allow the AER service on non-Root Port, non-Root Complex Event Collector devices. This means we won't enable Bus Mastering if the device doesn't require MSI, the AER service will not appear in sysfs, and the AER service driver will not bind to the device. Link: https://lore.kernel.org/r/20221207084105.84947-1-mika.westerberg@linux.intel.com Link: https://lore.kernel.org/r/20221210002922.1749403-1-helgaas@kernel.org Based-on-patch-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/pcie/portdrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index a6c4225505d53..8b16e96ec15c6 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -232,7 +232,9 @@ static int get_port_device_capability(struct pci_dev *dev) } #ifdef CONFIG_PCIEAER - if (dev->aer_cap && pci_aer_available() && + if ((pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || + pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC) && + dev->aer_cap && pci_aer_available() && (pcie_ports_native || host->native_aer)) services |= PCIE_PORT_SERVICE_AER; #endif -- GitLab From 07eab0901ede8b7540c52160663bd300cc238164 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 8 Dec 2022 13:03:38 -0600 Subject: [PATCH 1306/1423] efi/x86: Remove EfiMemoryMappedIO from E820 map Firmware can use EfiMemoryMappedIO to request that MMIO regions be mapped by the OS so they can be accessed by EFI runtime services, but should have no other significance to the OS (UEFI r2.10, sec 7.2). However, most bootloaders and EFI stubs convert EfiMemoryMappedIO regions to E820_TYPE_RESERVED entries, which prevent Linux from allocating space from them (see remove_e820_regions()). Some platforms use EfiMemoryMappedIO entries for PCI MMCONFIG space and PCI host bridge windows, which means Linux can't allocate BAR space for hot-added devices. Remove large EfiMemoryMappedIO regions from the E820 map to avoid this problem. Leave small (< 256KB) EfiMemoryMappedIO regions alone because on some platforms, these describe non-window space that's included in host bridge _CRS. If we assign that space to PCI devices, they don't work. On the Lenovo X1 Carbon, this leads to suspend/resume failures. The previous solution to the problem of allocating BARs in these regions was to add pci_crs_quirks[] entries to disable E820 checking for these machines (see d341838d776a ("x86/PCI: Disable E820 reserved region clipping via quirks")): Acer DMI_PRODUCT_NAME Spin SP513-54N Clevo DMI_BOARD_NAME X170KM-G Lenovo DMI_PRODUCT_VERSION *IIL* Florent reported the BAR allocation issue on the Clevo NL4XLU. We could add another quirk for the NL4XLU, but I hope this generic change can solve it for many machines without having to add quirks. This change has been tested on Clevo X170KM-G (Konrad) and Lenovo Ideapad Slim 3 (Matt) and solves the problem even when overriding the existing quirks by booting with "pci=use_e820". Link: https://bugzilla.kernel.org/show_bug.cgi?id=216565 Clevo NL4XLU Link: https://bugzilla.kernel.org/show_bug.cgi?id=206459#c78 Clevo X170KM-G Link: https://bugzilla.redhat.com/show_bug.cgi?id=1868899 Ideapad Slim 3 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2029207 X1 Carbon Link: https://lore.kernel.org/r/20221208190341.1560157-2-helgaas@kernel.org Reported-by: Florent DELAHAYE Tested-by: Konrad J Hambrick Tested-by: Matt Hansen <2lprbe78@duck.com> Signed-off-by: Bjorn Helgaas Acked-by: Hans de Goede --- arch/x86/platform/efi/efi.c | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ebc98a68c4005..75bf0e56bb53d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -303,6 +303,50 @@ static void __init efi_clean_memmap(void) } } +/* + * Firmware can use EfiMemoryMappedIO to request that MMIO regions be + * mapped by the OS so they can be accessed by EFI runtime services, but + * should have no other significance to the OS (UEFI r2.10, sec 7.2). + * However, most bootloaders and EFI stubs convert EfiMemoryMappedIO + * regions to E820_TYPE_RESERVED entries, which prevent Linux from + * allocating space from them (see remove_e820_regions()). + * + * Some platforms use EfiMemoryMappedIO entries for PCI MMCONFIG space and + * PCI host bridge windows, which means Linux can't allocate BAR space for + * hot-added devices. + * + * Remove large EfiMemoryMappedIO regions from the E820 map to avoid this + * problem. + * + * Retain small EfiMemoryMappedIO regions because on some platforms, these + * describe non-window space that's included in host bridge _CRS. If we + * assign that space to PCI devices, they don't work. + */ +static void __init efi_remove_e820_mmio(void) +{ + efi_memory_desc_t *md; + u64 size, start, end; + int i = 0; + + for_each_efi_memory_desc(md) { + if (md->type == EFI_MEMORY_MAPPED_IO) { + size = md->num_pages << EFI_PAGE_SHIFT; + start = md->phys_addr; + end = start + size - 1; + if (size >= 256*1024) { + pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n", + i, start, end, size >> 20); + e820__range_remove(start, size, + E820_TYPE_RESERVED, 1); + } else { + pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n", + i, start, end, size >> 10); + } + } + i++; + } +} + void __init efi_print_memmap(void) { efi_memory_desc_t *md; @@ -474,6 +518,8 @@ void __init efi_init(void) set_bit(EFI_RUNTIME_SERVICES, &efi.flags); efi_clean_memmap(); + efi_remove_e820_mmio(); + if (efi_enabled(EFI_DBG)) efi_print_memmap(); } -- GitLab From 5c5fb3c3a793b34554e1d21f07cda34308b082cd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 8 Dec 2022 13:03:39 -0600 Subject: [PATCH 1307/1423] PCI: Skip allocate_resource() if too little space available pci_bus_alloc_from_region() allocates MMIO space by iterating through all the resources available on the bus. The available resource might be reduced if the caller requires 32-bit space or we're avoiding BIOS or E820 areas. Don't bother calling allocate_resource() if we need more space than is available in this resource. This prevents some pointless and annoying messages about avoided areas. Link: https://lore.kernel.org/r/20221208190341.1560157-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Hans de Goede --- drivers/pci/bus.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 3cef835b375fd..83ae838ceb5f0 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -197,6 +197,10 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, struct resource *res, max = avail.end; + /* Don't bother if available space isn't large enough */ + if (size > max - min_used + 1) + continue; + /* Ok, try it out.. */ ret = allocate_resource(r, res, size, min_used, max, align, alignf, alignf_data); -- GitLab From 00904bf64c2819d90a76407d79bdbc8918541320 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 8 Dec 2022 13:03:40 -0600 Subject: [PATCH 1308/1423] x86/PCI: Tidy E820 removal messages These messages: clipped [mem size 0x00000000 64bit] to [mem size 0xfffffffffffa0000 64bit] for e820 entry [mem 0x0009f000-0x000fffff] aren't as useful as they could be because (a) the resource is often IORESOURCE_UNSET, so we print the size instead of the start/end and (b) we print the available resource even if it is empty after removing the E820 entry. Print the available space by hand to avoid the IORESOURCE_UNSET problem and only if it's non-empty. No functional change intended. Link: https://lore.kernel.org/r/20221208190341.1560157-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Acked-by: Hans de Goede --- arch/x86/kernel/resource.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index bba1abd05bfeb..79bc8a97a083c 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -42,8 +42,16 @@ static void remove_e820_regions(struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", - &orig, avail, e820_start, e820_end); + pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n", + e820_start, e820_end); + if (avail->end > avail->start) + /* + * Use %pa instead of %pR because "avail" + * is typically IORESOURCE_UNSET, so %pR + * shows the size instead of addresses. + */ + pr_info("resource: remaining [mem %pa-%pa] available\n", + &avail->start, &avail->end); orig = *avail; } } -- GitLab From 2bfa89fab5ff06703c034c4702e6ea4418194ffe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 8 Dec 2022 13:03:41 -0600 Subject: [PATCH 1309/1423] x86/PCI: Fix log message typo Add missing word in the log message: - ... so future kernels can this automatically + ... so future kernels can do this automatically Suggested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20221208190341.1560157-5-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Acked-by: Hans de Goede --- arch/x86/pci/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2f82480fd4305..83dfea9e9894b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -245,7 +245,7 @@ void __init pci_acpi_crs_quirks(void) printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n", pci_use_e820 ? "Using" : "Ignoring"); if (pci_probe & (PCI_NO_E820 | PCI_USE_E820)) - printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n"); + printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n"); } #ifdef CONFIG_PCI_MMCONFIG -- GitLab From d91482bb212b36354b0e46d7a5c0adae807e7a12 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 9 Dec 2022 14:41:27 -0600 Subject: [PATCH 1310/1423] x86/PCI: Use pr_info() when possible Use pr_info() and similar when possible. No functional change intended. Link: https://lore.kernel.org/r/20221209205131.GA1726524@bhelgaas Suggested-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- arch/x86/pci/acpi.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 83dfea9e9894b..ea2eb2ec90e2b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "PCI: " fmt + #include #include #include @@ -37,15 +40,15 @@ static int __init set_nouse_crs(const struct dmi_system_id *id) static int __init set_ignore_seg(const struct dmi_system_id *id) { - printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident); + pr_info("%s detected: ignoring ACPI _SEG\n", id->ident); pci_ignore_seg = true; return 0; } static int __init set_no_e820(const struct dmi_system_id *id) { - printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n", - id->ident); + pr_info("%s detected: not clipping E820 regions from _CRS\n", + id->ident); pci_use_e820 = false; return 0; } @@ -231,10 +234,9 @@ void __init pci_acpi_crs_quirks(void) else if (pci_probe & PCI_USE__CRS) pci_use_crs = true; - printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " - "if necessary, use \"pci=%s\" and report a bug\n", - pci_use_crs ? "Using" : "Ignoring", - pci_use_crs ? "nocrs" : "use_crs"); + pr_info("%s host bridge windows from ACPI; if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? "nocrs" : "use_crs"); /* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */ if (pci_probe & PCI_NO_E820) @@ -242,19 +244,17 @@ void __init pci_acpi_crs_quirks(void) else if (pci_probe & PCI_USE_E820) pci_use_e820 = true; - printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n", - pci_use_e820 ? "Using" : "Ignoring"); + pr_info("%s E820 reservations for host bridge windows\n", + pci_use_e820 ? "Using" : "Ignoring"); if (pci_probe & (PCI_NO_E820 | PCI_USE_E820)) - printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n"); + pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n"); } #ifdef CONFIG_PCI_MMCONFIG static int check_segment(u16 seg, struct device *dev, char *estr) { if (seg) { - dev_err(dev, - "%s can't access PCI configuration " - "space under this host bridge.\n", + dev_err(dev, "%s can't access configuration space under this host bridge\n", estr); return -EIO; } @@ -264,9 +264,7 @@ static int check_segment(u16 seg, struct device *dev, char *estr) * just can't access extended configuration space of * devices under this host bridge. */ - dev_warn(dev, - "%s can't access extended PCI configuration " - "space under this bridge.\n", + dev_warn(dev, "%s can't access extended configuration space under this bridge\n", estr); return 0; @@ -421,9 +419,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root->segment = domain = 0; if (domain && !pci_domains_supported) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (multiple domains not supported)\n", - domain, busnum); + pr_warn("pci_bus %04x:%02x: ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } @@ -491,7 +488,7 @@ int __init pci_acpi_init(void) if (acpi_noirq) return -ENODEV; - printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); + pr_info("Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; @@ -503,7 +500,7 @@ int __init pci_acpi_init(void) * also do it here in case there are still broken drivers that * don't use pci_enable_device(). */ - printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + pr_info("Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); for_each_pci_dev(dev) acpi_pci_irq_enable(dev); } -- GitLab From e42f9c2e6aad583986e91979bf2fce47aaced1c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 9 Dec 2022 10:21:56 -0400 Subject: [PATCH 1311/1423] RDMA: Add missed netdev_put() for the netdevice_tracker The netdev core will detect if any untracked puts are done on tracked pointers and throw refcount warnings: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 1 PID: 33 at lib/refcount.c:31 refcount_warn_saturate+0x1d7/0x1f0 lib/refcount.c:31 Modules linked in: CPU: 1 PID: 33 Comm: kworker/u4:2 Not tainted 6.1.0-rc8-next-20221207-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: ib-unreg-wq ib_unregister_work RIP: 0010:refcount_warn_saturate+0x1d7/0x1f0 lib/refcount.c:31 Code: 05 5a 60 51 0a 01 e8 35 0a b5 05 0f 0b e9 d3 fe ff ff e8 6c 9b 75 fd 48 c7 c7 c0 6d a6 8a c6 05 37 60 51 0a 01 e8 16 0a b5 05 <0f> 0b e9 b4 fe +ff ff 48 89 ef e8 5a b5 c3 fd e9 5c fe ff ff 0f 1f RSP: 0018:ffffc90000aa7b30 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff8880172f9d40 RSI: ffffffff8166b1dc RDI: fffff52000154f58 RBP: ffff88807906c600 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000080000001 R11: 0000000000000000 R12: 1ffff92000154f6b R13: 0000000000000000 R14: ffff88807906c600 R15: ffff888046894000 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffe350a8ff8 CR3: 000000007a9e7000 CR4: 00000000003526e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] ref_tracker_free+0x539/0x6b0 lib/ref_tracker.c:118 netdev_tracker_free include/linux/netdevice.h:4039 [inline] netdev_put include/linux/netdevice.h:4056 [inline] dev_put include/linux/netdevice.h:4082 [inline] free_netdevs+0x1f8/0x470 drivers/infiniband/core/device.c:2204 __ib_unregister_device+0xa0/0x1a0 drivers/infiniband/core/device.c:1478 ib_unregister_work+0x19/0x30 drivers/infiniband/core/device.c:1586 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x669/0x1090 kernel/workqueue.c:2436 kthread+0x2e8/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 So change the missed dev_put for pdata->netdev to also follow the tracker. Fixes: 09f530f0c6d6 ("RDMA: Add netdevice_tracker to ib_device_set_netdev()") Reported-by: syzbot+3fd8326d9a0812d19218@syzkaller.appspotmail.com Reported-by: syzbot+a1ed8ffe3121380cd5dd@syzkaller.appspotmail.com Reported-by: syzbot+8d0a099c8a6d1e4e601c@syzkaller.appspotmail.com Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-e99919867b8d+1e2-netdev_tracker2_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9211d2794aac7..894c06846224e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2201,7 +2201,7 @@ static void free_netdevs(struct ib_device *ib_dev) * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); - dev_put(ndev); + netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } -- GitLab From dbc94a0fb81771a38733c0e8f2ea8c4fa6934dc1 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Dec 2022 09:52:54 +0200 Subject: [PATCH 1312/1423] IB/IPoIB: Fix queue count inconsistency for PKEY child interfaces There are 2 ways to create IPoIB PKEY child interfaces: 1) Writing a PKEY to /sys/class/net//create_child. 2) Using netlink with iproute. While with sysfs the child interface has the same number of tx and rx queues as the parent, with netlink there will always be 1 tx and 1 rx queue for the child interface. That's because the get_num_tx/rx_queues() netlink ops are missing and the default value of 1 is taken for the number of queues (in rtnl_create_link()). This change adds the get_num_tx/rx_queues() ops which allows for interfaces with multiple queues to be created over netlink. This constant only represents the max number of tx and rx queues on that net device. Fixes: 9baa0b036410 ("IB/ipoib: Add rtnl_link_ops support") Signed-off-by: Dragos Tatulea Link: https://lore.kernel.org/r/f4a42c8aa43c02d5ae5559a60c3e5e0f18c82531.1670485816.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index ea16ba5d8da6c..9ad8d98562752 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -41,6 +41,11 @@ static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, }; +static unsigned int ipoib_get_max_num_queues(void) +{ + return min_t(unsigned int, num_possible_cpus(), 128); +} + static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -172,6 +177,8 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = { .changelink = ipoib_changelink, .get_size = ipoib_get_size, .fill_info = ipoib_fill_info, + .get_num_rx_queues = ipoib_get_max_num_queues, + .get_num_tx_queues = ipoib_get_max_num_queues, }; struct rtnl_link_ops *ipoib_get_link_ops(void) -- GitLab From 37ba7b005a7a4454046bd8659c7a9c5330552396 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 29 Oct 2022 00:01:38 +0900 Subject: [PATCH 1313/1423] ksmbd: set SMB2_SESSION_FLAG_ENCRYPT_DATA when enforcing data encryption for this share Currently, SMB2_SESSION_FLAG_ENCRYPT_DATA is always set session setup response. Since this forces data encryption from the client, there is a problem that data is always encrypted regardless of the use of the cifs seal mount option. SMB2_SESSION_FLAG_ENCRYPT_DATA should be set according to KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION flags, and in case of KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF, encryption mode is turned off for all connections. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 1 + fs/ksmbd/smb2ops.c | 10 ++++++++-- fs/ksmbd/smb2pdu.c | 8 +++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ff07c67f4565e..b6bd8311e6b4d 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -74,6 +74,7 @@ struct ksmbd_heartbeat { #define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0) #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1) #define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2) +#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3) /* * IPC request for ksmbd server startup diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index ab23da2120b94..e401302478c36 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) @@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d03..56d68ddc409cd 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, return; } - if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) return; for (i = 0; i < cph_cnt; i++) { @@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; /* * signing is disable if encryption is enable * on this session @@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; sess->sign = false; } -- GitLab From 7ecbe92696bb7fe32c80b6cf64736a0d157717a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 08:11:53 -0500 Subject: [PATCH 1314/1423] ksmbd: use F_SETLK when unlocking a file ksmbd seems to be trying to use a cmd value of 0 when unlocking a file. That activity requires a type of F_UNLCK with a cmd of F_SETLK. For local POSIX locking, it doesn't matter much since vfs_lock_file ignores @cmd, but filesystems that define their own ->lock operation expect to see it set sanely. Cc: David Howells Signed-off-by: Jeff Layton Reviewed-by: David Howells Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 56d68ddc409cd..de8e367095c9d 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6753,7 +6753,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); flock->fl_type = F_UNLCK; - cmd = 0; + cmd = F_SETLK; break; } @@ -7131,7 +7131,7 @@ out: rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; - rc = vfs_lock_file(filp, 0, rlock, NULL); + rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); if (rc) pr_err("rollback unlock fail : %d\n", rc); -- GitLab From 30429388531b120902126a39cf64f877fc5c7773 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: [PATCH 1315/1423] ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index de8e367095c9d..79261493a2128 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3440,7 +3440,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3692,7 +3692,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f87505..aa5dbe54f5a18 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da3..e663ab9ea7590 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- GitLab From bc044414fa0326a4e5c3c509c00b1fcaf621b5f4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 16 Nov 2022 20:22:37 +0800 Subject: [PATCH 1316/1423] ksmbd: Fix resource leak in ksmbd_session_rpc_open() When ksmbd_rpc_open() fails then it must call ksmbd_rpc_id_free() to undo the result of ksmbd_ipc_id_alloc(). Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Signed-off-by: Xiu Jianfeng Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/mgmt/user_session.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 3fa2139a0b309..92b1603b5abeb 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) - goto error; + goto free_entry; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto error; + goto free_id; kvfree(resp); return entry->id; -error: +free_id: + ksmbd_rpc_id_free(entry->id); +free_entry: list_del(&entry->list); kfree(entry); return -EINVAL; -- GitLab From 01f6c61bae3d658058ee6322af77acea26a5ee3a Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Tue, 29 Nov 2022 12:19:33 +0100 Subject: [PATCH 1317/1423] ksmbd: Fix resource leak in smb2_lock() "flock" is leaked if an error happens before smb2_lock_init(), as the lock is not added to the lock_list to be cleaned up. Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 79261493a2128..88d2c23369fe8 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6857,6 +6857,7 @@ int smb2_lock(struct ksmbd_work *work) if (lock_start > U64_MAX - lock_length) { pr_err("Invalid lock range requested\n"); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6876,6 +6877,7 @@ int smb2_lock(struct ksmbd_work *work) "the end offset(%llx) is smaller than the start offset(%llx)\n", flock->fl_end, flock->fl_start); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6887,6 +6889,7 @@ int smb2_lock(struct ksmbd_work *work) flock->fl_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; + locks_free_lock(flock); goto out; } } @@ -6895,6 +6898,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list); if (!smb_lock) { err = -EINVAL; + locks_free_lock(flock); goto out; } } -- GitLab From 72ee45fd46d0d3578c4e6046f66fae3218543ce3 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 7 Dec 2022 09:29:27 +0800 Subject: [PATCH 1318/1423] ksmbd: Convert to use sysfs_emit()/sysfs_emit_at() APIs Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/server.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index a0d635304754a..394b6ceac4312 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -432,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, "reset", "shutdown" }; - - ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version, - state[server_conf.state], server_conf.tcp_port, - server_conf.ipc_last_active / HZ); - return sz; + return sysfs_emit(buf, "%d %s %d %lu\n", stats_version, + state[server_conf.state], server_conf.tcp_port, + server_conf.ipc_last_active / HZ); } static ssize_t kill_server_store(struct class *class, @@ -468,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { if ((ksmbd_debug_types >> i) & 1) { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "[%s] ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]); } else { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "%s ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]); } sz += pos; } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + sz += sysfs_emit_at(buf, sz, "\n"); return sz; } -- GitLab From 01744ce9f07f0b76b0b2d30adba2a7c104f1ff2a Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Mon, 5 Dec 2022 10:54:13 +0000 Subject: [PATCH 1319/1423] i3c: Correct the macro module_i3c_i2c_driver Present definition for module_i3c_i2c_driver uses only the 1st argument i.e., struct i3c_driver. Irrespective of CONFIG_I3C being enabled/disabled, struct i2c_driver is never passed to module_driver() Passing struct i2c_driver as the 4th argument works. Signed-off-by: Akshay Gupta Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20221205105413.937704-1-naveenkrishna.chatradhi@amd.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 8242e13e7b0bc..419192b5cc4d3 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -287,7 +287,8 @@ static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, #define module_i3c_i2c_driver(__i3cdrv, __i2cdrv) \ module_driver(__i3cdrv, \ i3c_i2c_driver_register, \ - i3c_i2c_driver_unregister) + i3c_i2c_driver_unregister, \ + __i2cdrv) int i3c_device_do_priv_xfers(struct i3c_device *dev, struct i3c_priv_xfer *xfers, -- GitLab From 672825cd2823a0cee4687ce80fef5b702ff3caa3 Mon Sep 17 00:00:00 2001 From: Jack Chen Date: Wed, 7 Dec 2022 15:50:59 -0500 Subject: [PATCH 1320/1423] i3c: export SETDASA method Because not all I3C drivers have the hot-join feature ready, and especially not all I3C devices support hot-join feature, exporting SETDASA method could be useful. With this function, the I3C controller could perform a DAA to I3C devices when users decide to turn these I3C devices off and on again during run-time. Tested: This change has been tested with turnning off an I3C device and turning on it again during run-time. The device driver calls SETDASA method to perform DAA to the device. And communication between I3C controller and device is set up again correctly. Signed-off-by: Jack Chen Link: https://lore.kernel.org/r/20221207205059.3848851-1-zenghuchen@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/device.c | 20 ++++++++++++++++++++ drivers/i3c/internals.h | 1 + drivers/i3c/master.c | 19 +++++++++++++++++++ include/linux/i3c/device.h | 2 ++ 4 files changed, 42 insertions(+) diff --git a/drivers/i3c/device.c b/drivers/i3c/device.c index e92d3e9a52bd5..9762630b917ea 100644 --- a/drivers/i3c/device.c +++ b/drivers/i3c/device.c @@ -50,6 +50,26 @@ int i3c_device_do_priv_xfers(struct i3c_device *dev, } EXPORT_SYMBOL_GPL(i3c_device_do_priv_xfers); +/** + * i3c_device_do_setdasa() - do I3C dynamic address assignement with + * static address + * + * @dev: device with which the DAA should be done + * + * Return: 0 in case of success, a negative error core otherwise. + */ +int i3c_device_do_setdasa(struct i3c_device *dev) +{ + int ret; + + i3c_bus_normaluse_lock(dev->bus); + ret = i3c_dev_setdasa_locked(dev->desc); + i3c_bus_normaluse_unlock(dev->bus); + + return ret; +} +EXPORT_SYMBOL_GPL(i3c_device_do_setdasa); + /** * i3c_device_get_info() - get I3C device information * diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h index 86b7b44cfca28..908a807badaf9 100644 --- a/drivers/i3c/internals.h +++ b/drivers/i3c/internals.h @@ -15,6 +15,7 @@ extern struct bus_type i3c_bus_type; void i3c_bus_normaluse_lock(struct i3c_bus *bus); void i3c_bus_normaluse_unlock(struct i3c_bus *bus); +int i3c_dev_setdasa_locked(struct i3c_dev_desc *dev); int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev, struct i3c_priv_xfer *xfers, int nxfers); diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 351c81a929a6c..d7e6f6c99aea5 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2708,6 +2708,25 @@ int i3c_master_unregister(struct i3c_master_controller *master) } EXPORT_SYMBOL_GPL(i3c_master_unregister); +int i3c_dev_setdasa_locked(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *master; + + if (!dev) + return -ENOENT; + + master = i3c_dev_get_master(dev); + if (!master) + return -EINVAL; + + if (!dev->boardinfo || !dev->boardinfo->init_dyn_addr || + !dev->boardinfo->static_addr) + return -EINVAL; + + return i3c_master_setdasa_locked(master, dev->info.static_addr, + dev->boardinfo->init_dyn_addr); +} + int i3c_dev_do_priv_xfers_locked(struct i3c_dev_desc *dev, struct i3c_priv_xfer *xfers, int nxfers) diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 419192b5cc4d3..1c997abe868c6 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -294,6 +294,8 @@ int i3c_device_do_priv_xfers(struct i3c_device *dev, struct i3c_priv_xfer *xfers, int nxfers); +int i3c_device_do_setdasa(struct i3c_device *dev); + void i3c_device_get_info(struct i3c_device *dev, struct i3c_device_info *info); struct i3c_ibi_payload { -- GitLab From 08dcf0732cb4d97b85493d9f60470e48eebf87fe Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 11 Dec 2022 21:55:38 +0100 Subject: [PATCH 1321/1423] MAINTAINERS: mark I3C DRIVER FOR SYNOPSYS DESIGNWARE orphan Vitor left Synopsys and the email address is now bouncing. Link: https://lore.kernel.org/r/20221211205539.19353-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f185023724..ebd7f7c957ea7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9697,8 +9697,7 @@ F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml F: drivers/i3c/master/i3c-master-cdns.c I3C DRIVER FOR SYNOPSYS DESIGNWARE -M: Vitor Soares -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml F: drivers/i3c/master/dw* -- GitLab From b003b3b77d65133a0011ae3b7b255347438c12f6 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 29 Nov 2022 18:35:14 -0800 Subject: [PATCH 1322/1423] RISC-V: Align the shadow stack The standard RISC-V ABIs all require 16-byte stack alignment. We're only calling that one function on the shadow stack so I doubt it'd result in a real issue, but might as well keep this lined up. Fixes: 31da94c25aea ("riscv: add VMAP_STACK overflow detection") Reviewed-by: Jisheng Zhang Link: https://lore.kernel.org/r/20221130023515.20217-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index be54ccea8c475..acdfcacd7e578 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -206,7 +206,7 @@ static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], * shadow stack, handled_ kernel_ stack_ overflow(in kernel/entry.S) is used * to get per-cpu overflow stack(get_overflow_stack). */ -long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)]; +long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)] __aligned(16); asmlinkage unsigned long get_overflow_stack(void) { return (unsigned long)this_cpu_ptr(overflow_stack) + -- GitLab From de57ecc476103179e93fd85091770921f76a19af Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 29 Nov 2022 18:35:15 -0800 Subject: [PATCH 1323/1423] RISC-V: Add some comments about the shadow and overflow stacks It took me a while to page all this back in when trying to review the recent spin_shadow_stack, so I figured I'd just write up some comments. Reviewed-by: Guo Ren Reviewed-by: Jisheng Zhang Link: https://lore.kernel.org/r/20221130023515.20217-2-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index acdfcacd7e578..336d4aadadb1c 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -200,18 +200,18 @@ void __init trap_init(void) } #ifdef CONFIG_VMAP_STACK +/* + * Extra stack space that allows us to provide panic messages when the kernel + * has overflowed its stack. + */ static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)__aligned(16); /* - * shadow stack, handled_ kernel_ stack_ overflow(in kernel/entry.S) is used - * to get per-cpu overflow stack(get_overflow_stack). + * A temporary stack for use by handle_kernel_stack_overflow. This is used so + * we can call into C code to get the per-hart overflow stack. Usage of this + * stack must be protected by spin_shadow_stack. */ long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)] __aligned(16); -asmlinkage unsigned long get_overflow_stack(void) -{ - return (unsigned long)this_cpu_ptr(overflow_stack) + - OVERFLOW_STACK_SIZE; -} /* * A pseudo spinlock to protect the shadow stack from being used by multiple @@ -222,6 +222,12 @@ asmlinkage unsigned long get_overflow_stack(void) */ unsigned long spin_shadow_stack; +asmlinkage unsigned long get_overflow_stack(void) +{ + return (unsigned long)this_cpu_ptr(overflow_stack) + + OVERFLOW_STACK_SIZE; +} + asmlinkage void handle_bad_stack(struct pt_regs *regs) { unsigned long tsk_stk = (unsigned long)current->stack; -- GitLab From e4b731ccb0975fd97283e0c0d9841a89063ec31a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 18 Oct 2022 09:03:29 +0800 Subject: [PATCH 1324/1423] ceph: remove useless session parameter for check_caps() The session parameter makes no sense any more. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 23 +++++++++-------------- fs/ceph/file.c | 17 +++++++---------- fs/ceph/inode.c | 6 +++--- fs/ceph/ioctl.c | 2 +- fs/ceph/super.h | 3 +-- 6 files changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1c..ff6e3c279a798 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1367,7 +1367,7 @@ out: folio_put(folio); if (check_cap) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e54814d0c2f7b..1323fa28ab017 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1898,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. */ -void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session) +void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); @@ -1913,15 +1912,12 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; - - if (session) - ceph_get_mds_session(session); + struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); - ceph_put_mds_session(session); return; } @@ -2851,7 +2847,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) @@ -3140,7 +3136,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, switch (mode) { case PUT_CAP_REFS_SYNC: if (last) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; @@ -3255,7 +3251,7 @@ unlock: spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3604,10 +3600,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, - session); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* @@ -4333,7 +4328,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } @@ -4362,7 +4357,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de84..4e68220bc06d8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode) spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", inode, ceph_cap_string(wanted), ceph_cap_string(issued)); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return 0; } spin_unlock(&ci->i_ceph_lock); @@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file) if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && @@ -1092,7 +1092,7 @@ static void ceph_aio_complete(struct inode *inode, loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; if (endoff > i_size_read(inode)) { if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } spin_lock(&ci->i_ceph_lock); @@ -1421,8 +1421,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && pos > size) { if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1577,8 +1576,7 @@ out: check_caps = ceph_inode_set_size(inode, pos); if (check_caps) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1906,7 +1904,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -2521,8 +2519,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, - NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bad9eeb6a1a59..12173c00129f2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } /* @@ -1969,7 +1969,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } @@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work) __ceph_do_pending_vmtruncate(inode); if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 6e061bf62ad47..deac817647eb0 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file) spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else { dout("ioctl_layzio: file %p already lazy\n", file); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c7..e8bc1d0d2614b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1200,8 +1200,7 @@ extern void ceph_remove_capsnap(struct inode *inode, extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); -extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); -- GitLab From 68c62bee9d081cf815310b3a96e38d94fc16007d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 17 Oct 2022 22:17:35 +0800 Subject: [PATCH 1325/1423] ceph: try to check caps immediately after async creating finishes We should call the check_caps() again immediately after the async creating finishes in case the MDS is waiting for caps revocation to finish. Link: https://tracker.ceph.com/issues/46904 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/file.c | 9 +++++++++ fs/ceph/super.h | 2 ++ 3 files changed, 13 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1323fa28ab017..4b159f97fe7b5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1916,6 +1916,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; + /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4e68220bc06d8..a6681b12e2806 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + bool check_cap = false; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + + if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; + check_cap = true; + } } ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); + + if (check_cap) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } static void ceph_async_create_cb(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e8bc1d0d2614b..e1bd6f487226e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ +#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async + creating finishes */ /* * Masks of ceph inode work. -- GitLab From c19204cbd65c12fdcd34fb8f5d645007238ed5cd Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 8 Dec 2022 16:11:00 -0600 Subject: [PATCH 1326/1423] cifs: minor cleanup of some headers checkpatch showed formatting problems with extra spaces, and extra semicolon and some missing blank lines in some cifs headers. Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Germano Percossi Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 2 +- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/cifsglob.h | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d86d78d5bfdc1..332588e77c311 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -108,7 +108,7 @@ struct smb3_notify_info { #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) #define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) -#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) +#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* * Flags for going down operation diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 388b745a978e2..00a573e0ad0e1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); -extern int cifs_file_mmap(struct file * , struct vm_area_struct *); -extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); +extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1420acf987f03..cd3a173e65b1c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -785,6 +785,7 @@ static inline unsigned int in_flight(struct TCP_Server_Info *server) { unsigned int num; + spin_lock(&server->req_lock); num = server->in_flight; spin_unlock(&server->req_lock); @@ -795,6 +796,7 @@ static inline bool has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; + spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); @@ -1025,7 +1027,7 @@ struct cifs_ses { struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ - unsigned overrideSecFlg; /* if non-zero override global sec flags */ + unsigned int overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -1381,7 +1383,7 @@ struct cifsFileInfo { __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ struct list_head rlist; /* reconnect list */ - /* BB add lock scope info here if needed */ ; + /* BB add lock scope info here if needed */ /* lock scope id (0 if none) */ struct dentry *dentry; struct tcon_link *tlink; @@ -1769,6 +1771,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, int number_of_items) { int i; + if ((number_of_items == 0) || (param == NULL)) return; for (i = 0; i < number_of_items; i++) { -- GitLab From 9544597b5b63ff1674d60e069f93555ab924b62b Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 9 Dec 2022 16:55:51 -0600 Subject: [PATCH 1327/1423] cifs: fix various whitespace errors in headers Fix some extra spaces and a few comments that were unnecessarily split over two lines. These were some trivial issues pointed out by checkpatch) Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 50 +++++++++++++++++++-------------------------- fs/cifs/cifsproto.h | 2 +- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index d1abaeea974a9..623caece2b10c 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1429,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 WatchTree; /* 1 = Monitor subdirectories */ __u8 Reserved2; __le16 ByteCount; -/* __u8 Pad[3];*/ +/* __u8 Pad[3];*/ /* __u8 Data[1];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; @@ -1752,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); struct smb_t2_qfi_req { @@ -1768,8 +1767,7 @@ struct smb_t2_qfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); /* @@ -2146,13 +2144,11 @@ typedef struct { #define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based calls including posix open and posix unlink */ -#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up - to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 #define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ #define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ -#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and - QFS PROXY call */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX /* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send LockingX instead of posix locking call on unix sess (and we do not expect @@ -2368,8 +2364,7 @@ typedef struct { struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ -} __attribute__((packed)); /* size used on disk, for level 0x103 for set, - 0x105 for query */ +} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */ struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ @@ -2409,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ __le16 access_entry_count; /* access ACL - count of entries */ __le16 default_entry_count; /* default ACL - count of entries */ struct cifs_posix_ace ace_array[]; - /* followed by - struct cifs_posix_ace default_ace_arraay[] */ + /* followed by struct cifs_posix_ace default_ace_array[] */ } __attribute__((packed)); /* level 0x204 */ /* types of access control entries already defined in posix_acl.h */ @@ -2429,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ /* end of POSIX ACL definitions */ /* POSIX Open Flags */ -#define SMB_O_RDONLY 0x1 -#define SMB_O_WRONLY 0x2 -#define SMB_O_RDWR 0x4 -#define SMB_O_CREAT 0x10 -#define SMB_O_EXCL 0x20 -#define SMB_O_TRUNC 0x40 -#define SMB_O_APPEND 0x80 -#define SMB_O_SYNC 0x100 -#define SMB_O_DIRECTORY 0x200 -#define SMB_O_NOFOLLOW 0x400 -#define SMB_O_DIRECT 0x800 +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 typedef struct { __le32 OpenFlags; /* same as NT CreateX */ @@ -2716,15 +2710,13 @@ typedef struct file_xattr_info { __u32 xattr_value_len; char xattr_name[]; /* followed by xattr_value[xattr_value_len], no pad */ -} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info - level 0x205 */ +} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */ /* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ typedef struct file_chattr_info { __le64 mask; /* list of all possible attribute bits */ __le64 mode; /* list of actual attribute bits on this inode */ -} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes - (chattr, chflags) level 0x206 */ -#endif /* POSIX */ +} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */ +#endif /* POSIX */ #endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabba..f216fa269c850 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -124,7 +124,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec * /* resp vec */); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, - struct smb_hdr *in_buf , + struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *bytes_returned); void -- GitLab From 2bfd81043e944af0e52835ef6d9b41795af22341 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 13:54:21 -0600 Subject: [PATCH 1328/1423] cifs: fix missing display of three mount options Three mount options: "tcpnodelay" and "noautotune" and "noblocksend" were not displayed when passed in on cifs/smb3 mounts (e.g. displayed in /proc/mounts e.g.). No change to defaults so these are not displayed if not specified on mount. Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 712a431614480..6094cb2ff099b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); - /* Only display max_credits if it was overridden on mount */ + /* Only display the following if overridden on mount */ if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); + if (tcon->ses->server->tcp_nodelay) + seq_puts(s, ",tcpnodelay"); + if (tcon->ses->server->noautotune) + seq_puts(s, ",noautotune"); + if (tcon->ses->server->noblocksnd) + seq_puts(s, ",noblocksend"); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); -- GitLab From 9d91f8108ebfed54284332e04d2073107df18794 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 14:44:31 -0600 Subject: [PATCH 1329/1423] cifs: print warning when conflicting soft vs. hard mount options specified If the user specifies conflicting hard vs. soft mount options (or nosoft vs. nohard) print a warning to dmesg We were missing a warning when a user e.g. mounted with both "hard,soft" mount options. Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/fs_context.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 45119597c7655..2c92a821e0284 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -884,16 +884,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->nodfs = 1; break; case Opt_hard: - if (result.negated) + if (result.negated) { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; - else + } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; - else + else { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; + } break; case Opt_mapposix: if (result.negated) -- GitLab From f7f291e14dde32a07b1f0aa06921d28f875a7b54 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 11 Dec 2022 18:18:55 -0300 Subject: [PATCH 1330/1423] cifs: fix oops during encryption When running xfstests against Azure the following oops occurred on an arm64 system Unable to handle kernel write to read-only memory at virtual address ffff0001221cf000 Mem abort info: ESR = 0x9600004f EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x0f: level 3 permission fault Data abort info: ISV = 0, ISS = 0x0000004f CM = 0, WnR = 1 swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000294f3000 [ffff0001221cf000] pgd=18000001ffff8003, p4d=18000001ffff8003, pud=18000001ff82e003, pmd=18000001ff71d003, pte=00600001221cf787 Internal error: Oops: 9600004f [#1] PREEMPT SMP ... pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--) pc : __memcpy+0x40/0x230 lr : scatterwalk_copychunks+0xe0/0x200 sp : ffff800014e92de0 x29: ffff800014e92de0 x28: ffff000114f9de80 x27: 0000000000000008 x26: 0000000000000008 x25: ffff800014e92e78 x24: 0000000000000008 x23: 0000000000000001 x22: 0000040000000000 x21: ffff000000000000 x20: 0000000000000001 x19: ffff0001037c4488 x18: 0000000000000014 x17: 235e1c0d6efa9661 x16: a435f9576b6edd6c x15: 0000000000000058 x14: 0000000000000001 x13: 0000000000000008 x12: ffff000114f2e590 x11: ffffffffffffffff x10: 0000040000000000 x9 : ffff8000105c3580 x8 : 2e9413b10000001a x7 : 534b4410fb86b005 x6 : 534b4410fb86b005 x5 : ffff0001221cf008 x4 : ffff0001037c4490 x3 : 0000000000000001 x2 : 0000000000000008 x1 : ffff0001037c4488 x0 : ffff0001221cf000 Call trace: __memcpy+0x40/0x230 scatterwalk_map_and_copy+0x98/0x100 crypto_ccm_encrypt+0x150/0x180 crypto_aead_encrypt+0x2c/0x40 crypt_message+0x750/0x880 smb3_init_transform_rq+0x298/0x340 smb_send_rqst.part.11+0xd8/0x180 smb_send_rqst+0x3c/0x100 compound_send_recv+0x534/0xbc0 smb2_query_info_compound+0x32c/0x440 smb2_set_ea+0x438/0x4c0 cifs_xattr_set+0x5d4/0x7c0 This is because in scatterwalk_copychunks(), we attempted to write to a buffer (@sign) that was allocated in the stack (vmalloc area) by crypt_message() and thus accessing its remaining 8 (x2) bytes ended up crossing a page boundary. To simply fix it, we could just pass @sign kmalloc'd from crypt_message() and then we're done. Luckily, we don't seem to pass any other vmalloc'd buffers in smb_rqst::rq_iov... Instead, let's map the correct pages and offsets from vmalloc buffers as well in cifs_sg_set_buf() and then avoiding such oopses. Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 68 +++++++++++++++++++++ fs/cifs/cifsproto.h | 4 +- fs/cifs/misc.c | 4 +- fs/cifs/smb2ops.c | 143 +++++++++++++++++++++----------------------- 4 files changed, 140 insertions(+), 79 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cd3a173e65b1c..703685e2db5e7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -2140,4 +2142,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } +static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) +{ + unsigned int len, skip; + unsigned int nents = 0; + unsigned long addr; + int i, j; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ + for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ + for (j = 0; j < rqst[i].rq_nvec; j++) { + struct kvec *iov = &rqst[i].rq_iov[j]; + + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + if (unlikely(is_vmalloc_addr((void *)addr))) { + len = iov->iov_len - skip; + nents += DIV_ROUND_UP(offset_in_page(addr) + len, + PAGE_SIZE); + } else { + nents++; + } + } + nents += rqst[i].rq_npages; + } + nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); + return nents; +} + +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. + */ +static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, + const void *buf, + unsigned int buflen) +{ + unsigned long addr = (unsigned long)buf; + unsigned int off = offset_in_page(addr); + + addr &= PAGE_MASK; + if (unlikely(is_vmalloc_addr((void *)addr))) { + do { + unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); + + sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + + off = 0; + addr += PAGE_SIZE; + buflen -= len; + } while (buflen); + } else { + sg_set_page(sg++, virt_to_page(addr), buflen, off); + } + return sg; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f216fa269c850..9c6147ca029d2 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -600,8 +600,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3e68d8208cf5e..1cbecd64d697f 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc) * @len: Where to store the length for this page: * @offset: Where to store the offset for this page */ -void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) { *len = rqst->rq_pagesz; *offset = (page == 0) ? rqst->rq_offset : 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 72b22d033ed56..6e772b31e02a3 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4204,69 +4204,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } -/* We can not use the normal sg_set_buf() as we will sometimes pass a - * stack object as buf. - */ -static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) +static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl, + unsigned int *num_sgs) { - void *addr; - /* - * VMAP_STACK (at least) puts stack into the vmalloc address space - */ - if (is_vmalloc_addr(buf)) - addr = vmalloc_to_page(buf); - else - addr = virt_to_page(buf); - sg_set_page(sg, addr, buflen, offset_in_page(buf)); + unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + unsigned int iv_size = crypto_aead_ivsize(tfm); + unsigned int len; + u8 *p; + + *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + + len = iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += *num_sgs * sizeof(**sgl); + + p = kmalloc(len, GFP_ATOMIC); + if (!p) + return NULL; + + *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + return p; } -/* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted - */ -static struct scatterlist * -init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) +static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl) { - unsigned int sg_len; + unsigned int off, len, skip; struct scatterlist *sg; - unsigned int i; - unsigned int j; - unsigned int idx = 0; - int skip; - - sg_len = 1; - for (i = 0; i < num_rqst; i++) - sg_len += rqst[i].rq_nvec + rqst[i].rq_npages; + unsigned int num_sgs; + unsigned long addr; + int i, j; + void *p; - sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); + if (!p) return NULL; - sg_init_table(sg, sg_len); + sg_init_table(*sgl, num_sgs); + sg = *sgl; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ for (j = 0; j < rqst[i].rq_nvec; j++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob - */ - skip = (i == 0) && (j == 0) ? 20 : 0; - smb2_sg_set_buf(&sg[idx++], - rqst[i].rq_iov[j].iov_base + skip, - rqst[i].rq_iov[j].iov_len - skip); - } + struct kvec *iov = &rqst[i].rq_iov[j]; + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + len = iov->iov_len - skip; + sg = cifs_sg_set_buf(sg, (void *)addr, len); + } for (j = 0; j < rqst[i].rq_npages; j++) { - unsigned int len, offset; - - rqst_page_get_length(&rqst[i], j, &len, &offset); - sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset); + rqst_page_get_length(&rqst[i], j, &len, &off); + sg_set_page(sg++, rqst[i].rq_pages[j], len, off); } } - smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE); - return sg; + cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + + return p; } static int @@ -4314,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 sign[SMB2_SIGNATURE_SIZE] = {}; u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; - char *iv; - unsigned int iv_len; + u8 *iv; DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + void *creq; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4352,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - req = aead_request_alloc(tfm, GFP_KERNEL); - if (!req) { - cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); + if (unlikely(!creq)) return -ENOMEM; - } if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); crypt_len += SMB2_SIGNATURE_SIZE; } - sg = init_sg(num_rqst, rqst, sign); - if (!sg) { - cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); - rc = -ENOMEM; - goto free_req; - } - - iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); - if (!iv) { - cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); - rc = -ENOMEM; - goto free_sg; - } - if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); @@ -4386,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } + aead_request_set_tfm(req, tfm); aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -4398,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(iv); -free_sg: - kfree_sensitive(sg); -free_req: - kfree_sensitive(req); + kfree_sensitive(creq); return rc; } -- GitLab From d1f0f50fbbbbca1e3e8157e51934613bf88f6d44 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 8 Dec 2022 09:33:41 +0800 Subject: [PATCH 1331/1423] samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe() Add missing pci_disable_device() in fail path of mdpy_fb_probe(). Besides, fix missing release functions in mdpy_fb_remove(). Fixes: cacade1946a4 ("sample: vfio mdev display - guest driver") Signed-off-by: Shang XiaoJing Link: https://lore.kernel.org/r/20221208013341.3999-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mdpy-fb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c index 9ec93d90e8a5a..4eb7aa11cfbb2 100644 --- a/samples/vfio-mdev/mdpy-fb.c +++ b/samples/vfio-mdev/mdpy-fb.c @@ -109,7 +109,7 @@ static int mdpy_fb_probe(struct pci_dev *pdev, ret = pci_request_regions(pdev, "mdpy-fb"); if (ret < 0) - return ret; + goto err_disable_dev; pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); @@ -191,6 +191,9 @@ err_release_fb: err_release_regions: pci_release_regions(pdev); +err_disable_dev: + pci_disable_device(pdev); + return ret; } @@ -199,7 +202,10 @@ static void mdpy_fb_remove(struct pci_dev *pdev) struct fb_info *info = pci_get_drvdata(pdev); unregister_framebuffer(info); + iounmap(info->screen_base); framebuffer_release(info); + pci_release_regions(pdev); + pci_disable_device(pdev); } static struct pci_device_id mdpy_fb_pci_table[] = { -- GitLab From fe3dd71db2b81c202bc80532bbe0e07238a45ed9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:01:26 +0300 Subject: [PATCH 1332/1423] vfio/mlx5: fix error code in mlx5vf_precopy_ioctl() The copy_to_user() function returns the number of bytes remaining to be copied but we want to return a negative error code here. Fixes: 0dce165b1adf ("vfio/mlx5: Introduce vfio precopy ioctl implementation") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKVknlf5Z5NPtU@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index cd90eb86128c3..94f7a0fd10e85 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -404,7 +404,10 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, done: mlx5vf_state_mutex_unlock(mvdev); - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + err_migf_unlock: mutex_unlock(&migf->lock); err_state_unlock: -- GitLab From 70be6f322860d322ebcd120cf0c05402ead5c6de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:02:17 +0300 Subject: [PATCH 1333/1423] vfio/mlx5: error pointer dereference in error handling This code frees the wrong "buf" variable and results in an error pointer dereference. Fixes: 34e2f27143d1 ("vfio/mlx5: Introduce multiple loads") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKia5SaiVxYmG5@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 94f7a0fd10e85..031ac8cc215d4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -826,7 +826,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(buf); + mlx5vf_free_data_buffer(migf->buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: -- GitLab From e480751970e84bc13ab5c288dbbe16b0638cc088 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 11:37:08 +0800 Subject: [PATCH 1334/1423] f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() have never been used since they were introduced by this commit 76f105a2dbcd("f2fs: add feature facility in superblock"). So let's remove them. BTW, convert f2fs_sb_has_##name to return bool. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 296683648d4fa..cf738f1275b2d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -203,10 +203,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ - (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ - (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -4387,7 +4383,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } -- GitLab From ed8ac22b6b75804743f1dae6563d75f85cfd1483 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 10:48:42 +0800 Subject: [PATCH 1335/1423] f2fs: introduce f2fs_is_readonly() for readability Introduce f2fs_is_readonly() and use it to simplify code. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf738f1275b2d..eb8c27c4e5fc2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4575,6 +4575,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5f6f632cf7cf..79bf1faf41616 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1351,8 +1351,7 @@ default_check: return -EINVAL; } - if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && - test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -2083,7 +2082,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); -- GitLab From 12607c1ba7637e750402f555b6695c50fce77a2b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:36:43 -0800 Subject: [PATCH 1336/1423] f2fs: specify extent cache for read explicitly Let's descrbie it's read extent cache. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 932c070173b97..8cd87aee0292b 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -383,7 +383,7 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (!i_ext || !i_ext->len) return; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -710,7 +710,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) + if (!test_opt(sbi, READ_EXTENT_CACHE)) return 0; if (!atomic_read(&sbi->total_zombie_tree)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb8c27c4e5fc2..1c39f8145b61b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -600,7 +600,7 @@ enum { #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 @@ -830,7 +830,7 @@ struct f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -838,7 +838,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -4407,7 +4407,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (!test_opt(sbi, EXTENT_CACHE) || + if (!test_opt(sbi, READ_EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT) || (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && !f2fs_sb_has_readonly(sbi))) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 577f109b4e1d9..2c705c60019b2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -629,7 +629,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9ee5a1176a07..84b147966080e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,7 +85,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == EXTENT_CACHE) { + } else if (type == READ_EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0a..0aa48704c77a0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9486ca49ecb15..51de358bc4520 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -449,8 +449,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79bf1faf41616..412c2e7352c04 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -814,10 +814,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1954,7 +1954,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2076,7 +2076,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2218,7 +2218,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2308,7 +2308,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; -- GitLab From 3bac20a8f011b8ed4012b43f4f33010432b3c647 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:44:58 -0800 Subject: [PATCH 1337/1423] f2fs: move internal functions into extent_cache.c No functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 88 +++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 69 +-------------------------------- 2 files changed, 81 insertions(+), 76 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8cd87aee0292b..2a8e31e6d518f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,77 @@ #include "node.h" #include +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; + + if (keep_clen) + return; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif +} + +static bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + if (!test_opt(sbi, READ_EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + return S_ISREG(inode->i_mode); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -591,16 +662,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true); next_en = en; } parts++; @@ -632,8 +703,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, /* 3. update extent in extent cache */ if (blkaddr) { - - set_extent_info(&ei, fofs, blkaddr, len); + __set_extent_info(&ei, fofs, len, blkaddr, false); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -692,7 +762,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1c39f8145b61b..04fdf010bb771 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,7 +621,7 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + block_t blk; /* start block address of the extent */ #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned int c_len; /* physical extent length of compressed blocks */ #endif @@ -846,17 +846,6 @@ static inline void set_raw_read_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -876,41 +865,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -2581,6 +2535,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -4403,26 +4358,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. - */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) -- GitLab From 749d543c0d451fff31e8f7a3e0a031ffcbf1ebb1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 10:01:18 -0800 Subject: [PATCH 1338/1423] f2fs: remove unnecessary __init_extent_tree Added into the caller. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2a8e31e6d518f..c6810347e2054 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,21 +386,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -460,8 +445,12 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt)) goto out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { + et->largest = en->ei; + et->cached_en = en; + spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); -- GitLab From e7547daccd6a37522f0af74ec4b5a3036f3dd328 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:26:29 -0800 Subject: [PATCH 1339/1423] f2fs: refactor extent_cache to support for read and more This patch prepares extent_cache to be ready for addition. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +- fs/f2fs/debug.c | 65 +++-- fs/f2fs/extent_cache.c | 463 +++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 119 +++++---- fs/f2fs/file.c | 8 +- fs/f2fs/gc.c | 4 +- fs/f2fs/inode.c | 6 +- fs/f2fs/node.c | 8 +- fs/f2fs/segment.c | 3 +- fs/f2fs/shrinker.c | 19 +- include/trace/events/f2fs.h | 62 +++-- 11 files changed, 470 insertions(+), 307 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35c19248b1e2d..75abd450730b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1126,7 +1126,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1195,7 +1195,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1217,7 +1217,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1485,7 +1485,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1695,7 +1695,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1740,7 +1740,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2201,7 +2201,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2635,7 +2635,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3359,7 +3359,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3400,7 +3400,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf69418..a9baa121d829f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6810347e2054..654a14ab8977c 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -17,21 +17,37 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, - block_t blk, bool keep_clen) + block_t blk, bool keep_clen, + enum extent_type type) { ei->fofs = fofs; - ei->blk = blk; ei->len = len; - if (keep_clen) - return; - + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; #ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; + ei->c_len = 0; #endif + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); } -static bool f2fs_may_extent_tree(struct inode *inode) +static bool __may_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -42,18 +58,16 @@ static bool f2fs_may_extent_tree(struct inode *inode) if (list_empty(&sbi->s_list)) return false; - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - return S_ISREG(inode->i_mode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; } static void __try_update_largest_extent(struct extent_tree *et, struct extent_node *en) { + if (et->type != EX_READ) + return; if (en->ei.len <= et->largest.len) return; @@ -62,28 +76,31 @@ static void __try_update_largest_extent(struct extent_tree *et, } static bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { + if (type == EX_READ) { #ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; #endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; } static bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) + struct extent_info *back, enum extent_type type) { - return __is_extent_mergeable(back, cur); + return __is_extent_mergeable(back, cur, type); } static bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { - return __is_extent_mergeable(cur, front); + return __is_extent_mergeable(cur, front, type); } static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, @@ -308,6 +325,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -321,16 +339,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -346,42 +366,47 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } @@ -415,35 +440,38 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ - if (i_ext && i_ext->len) { + if (!__may_extent_tree(inode, type)) { + /* drop largest read extent */ + if (type == EX_READ && i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, type); if (!i_ext || !i_ext->len) - return; + goto out; + + BUG_ON(type != EX_READ); get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -451,37 +479,40 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) et->largest = en->ei; et->cached_en = en; - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* initialize read cache */ + __f2fs_init_extent_tree(inode, ipage, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -495,23 +526,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -520,18 +552,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -543,12 +577,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -558,6 +592,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -580,47 +615,50 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; - } + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } - prev = et->largest; - dei.len = 0; + prev = et->largest; + dei.len = 0; - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -641,26 +679,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { __set_extent_info(&ei, end, org_end - end, - end - dei.fofs + dei.blk, false); + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), - en->ei.blk + (end - dei.fofs), true); + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -690,9 +732,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { - __set_extent_info(&ei, fofs, len, blkaddr, false); + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -722,19 +766,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -751,7 +796,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -762,24 +807,43 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -787,61 +851,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -854,31 +957,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -886,76 +1002,49 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) +void f2fs_destroy_extent_tree(struct inode *inode) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); + __destroy_extent_tree(inode, EX_READ); } -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - +static void __init_extent_tree_info(struct extent_tree_info *eti) { - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04fdf010bb771..7c68bedee6492 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -596,16 +596,22 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -621,10 +627,17 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - block_t blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -636,13 +649,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -805,7 +830,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -1626,14 +1652,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1718,10 +1737,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -3823,9 +3846,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3884,10 +3915,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4010,10 +4041,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4119,20 +4150,23 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4202,9 +4236,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4285,9 +4319,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab0a0d3730f6e..cbe7c24065c7b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -618,7 +618,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1496,7 +1496,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2558,7 +2558,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2590,7 +2590,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. */ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d19e26b2e875e..f0c6506d89758 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1146,7 +1146,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1164,7 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2c705c60019b2..086f201f15a02 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -607,7 +607,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 84b147966080e..07419c3e42a52 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -86,9 +86,11 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == READ_EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == DISCARD_CACHE) { @@ -859,7 +861,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51de358bc4520..8722d1a13c170 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -450,7 +450,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec8..33c490e69ae30 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7fbfce498472f..2bb37892d2ba1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); +TRACE_DEFINE_ENUM(EX_READ); #define show_block_type(type) \ __print_symbolic(type, \ @@ -1522,28 +1523,31 @@ TRACE_EVENT(f2fs_issue_flush, TRACE_EVENT(f2fs_lookup_extent_tree_start, - TP_PROTO(struct inode *inode, unsigned int pgofs), + TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), - TP_ARGS(inode, pgofs), + TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u", + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), - __entry->pgofs) + __entry->pgofs, + __entry->type == EX_READ ? "Read" : "N/A") ); -TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, +TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), @@ -1557,8 +1561,8 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) - __field(u32, blk) __field(unsigned int, len) + __field(u32, blk) ), TP_fast_assign( @@ -1566,26 +1570,26 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; - __entry->blk = ei->blk; __entry->len = ei->len; + __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "ext_info(fofs: %u, blk: %u, len: %u)", + "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, - __entry->blk, - __entry->len) + __entry->len, + __entry->blk) ); -TRACE_EVENT(f2fs_update_extent_tree_range, +TRACE_EVENT(f2fs_update_read_extent_tree_range, - TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len, + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + block_t blkaddr, unsigned int c_len), - TP_ARGS(inode, pgofs, blkaddr, len, c_len), + TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1600,67 +1604,73 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; - __entry->blk = blkaddr; __entry->len = len; + __entry->blk = blkaddr; __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u, " - "c_len = %u", + "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, - __entry->blk, __entry->len, + __entry->blk, __entry->c_len) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, - unsigned int tree_cnt), + unsigned int tree_cnt, enum extent_type type), - TP_ARGS(sbi, node_cnt, tree_cnt), + TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", + TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, - __entry->tree_cnt) + __entry->tree_cnt, + __entry->type == EX_READ ? "Read" : "N/A") ); TRACE_EVENT(f2fs_destroy_extent_tree, - TP_PROTO(struct inode *inode, unsigned int node_cnt), + TP_PROTO(struct inode *inode, unsigned int node_cnt, + enum extent_type type), - TP_ARGS(inode, node_cnt), + TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u", + TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), - __entry->node_cnt) + __entry->node_cnt, + __entry->type == EX_READ ? "Read" : "N/A") ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, -- GitLab From 72840cccc0a1a0a0dc1bb27b669a9111be6d0f6a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2022 13:51:09 -0800 Subject: [PATCH 1340/1423] f2fs: allocate the extent_cache by default Let's allocate it to remove the runtime complexity. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/namei.c | 4 ++-- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 654a14ab8977c..305f969e3ad12 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -47,20 +47,23 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } -static bool __may_extent_tree(struct inode *inode, enum extent_type type) +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ /* * for recovered files during mount do not create extents * if shrinker is not registered. */ - if (list_empty(&sbi->s_list)) + if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - if (type == EX_READ) - return __may_read_extent_tree(inode); - return false; + return __init_may_extent_tree(inode, type); } static void __try_update_largest_extent(struct extent_tree *et, @@ -439,20 +442,18 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, - enum extent_type type) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree_info *eti = &sbi->extent_tree[type]; - struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!__may_extent_tree(inode, type)) { + if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (type == EX_READ && i_ext && i_ext->len) { + if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); @@ -460,13 +461,11 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, goto out; } - et = __grab_extent_tree(inode, type); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) goto out; - BUG_ON(type != EX_READ); - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); @@ -486,14 +485,15 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, unlock_out: write_unlock(&et->lock); out: - if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ - __f2fs_init_extent_tree(inode, ipage, EX_READ); + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c68bedee6492..ec52e06f8e61c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4150,7 +4150,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -4159,6 +4159,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 086f201f15a02..c845c16f97d05 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? @@ -479,6 +477,10 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 58a91ce8fe083..46de782c2baaa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -284,8 +284,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -311,6 +309,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; -- GitLab From 71644dff481180ba024ac4f5cb1f068756357adf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2022 17:37:15 -0800 Subject: [PATCH 1341/1423] f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 Signed-off-by: xiongping1 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 ++ Documentation/filesystems/f2fs.rst | 4 + fs/f2fs/debug.c | 21 +++ fs/f2fs/extent_cache.c | 183 +++++++++++++++++++++++- fs/f2fs/f2fs.h | 38 +++++ fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 10 +- fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 33 +++++ fs/f2fs/shrinker.c | 10 +- fs/f2fs/super.c | 14 ++ fs/f2fs/sysfs.c | 24 ++++ include/trace/events/f2fs.h | 86 ++++++++++- 14 files changed, 430 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 84a009aab1a13..9e3756625a819 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -655,3 +655,17 @@ Description: When space utilization exceeds this, do background DISCARD aggressi Does DISCARD forcibly in a period of given min_discard_issue_time when the number of discards is not 0 and set discard granularity to 1. Default: 80 + +What: /sys/fs/f2fs//hot_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as hot. By default it was initialized as 262144 blocks + (equals to 1GB). + +What: /sys/fs/f2fs//warm_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as warm. By default it was initialized as 2621440 blocks + (equals to 10GB). diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 67e1f3e86f32f..220f3e0d3f559 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -347,6 +347,10 @@ memory=%s Control memory mode. This supports "normal" and "low" modes. Because of the nature of low memory devices, in this mode, f2fs will try to save memory sometimes by sacrificing performance. "normal" mode is the default mode and same as before. +age_extent_cache Enable an age extent cache based on rb-tree. It records + data block update frequency of the extent per inode, in + order to provide better temperature hints for data block + allocation. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a9baa121d829f..8f1ef742551fc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_largest = atomic64_read(&sbi->read_hit_largest); si->hit_total[EX_READ] += si->hit_largest; + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); @@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree[EX_READ], si->zombie_tree[EX_READ], si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v) si->cache_mem >> 10); seq_printf(s, " - read extent cache: %llu KB\n", si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 305f969e3ad12..1bd38a78ebba9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. + * http://www.xiaomi.com/ */ #include @@ -18,6 +22,7 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, enum extent_type type) { ei->fofs = fofs; @@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei, #ifdef CONFIG_F2FS_FS_COMPRESSION ei->c_len = 0; #endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; } } @@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); return false; } @@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back, #endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); } return false; } @@ -489,11 +519,22 @@ out: set_inode_flag(inode, FI_NO_EXTENT); } +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ if (__init_may_extent_tree(inode, EX_READ)) __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -544,6 +585,8 @@ out: if (type == EX_READ) trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode, if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + write_lock(&et->lock); if (type == EX_READ) { @@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&ei, end, org_end - end, end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); @@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, type); next_en = en; } @@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode, en = next_en; } + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + /* 3. update extent in read extent cache */ BUG_ON(type != EX_READ); if (tei->blk) { - __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -807,6 +870,72 @@ unlock_out: } #endif +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. + */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { struct extent_info ei; @@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ ei.blk = NULL_ADDR; else ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; } __update_extent_tree_range(dn->inode, &ei, type); } @@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin return __shrink_extent_tree(sbi, nr_shrink, EX_READ); } +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + static unsigned int __destroy_extent_node(struct inode *inode, enum extent_type type) { @@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode, void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); } static void __drop_extent_tree(struct inode *inode, enum extent_type type) @@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) void f2fs_drop_extent_tree(struct inode *inode) { __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); } static void __destroy_extent_tree(struct inode *inode, enum extent_type type) @@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type) void f2fs_destroy_extent_tree(struct inode *inode) { __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } static void __init_extent_tree_info(struct extent_tree_info *eti) @@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti) void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec52e06f8e61c..e8953c3dc81ab 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -107,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -607,9 +608,22 @@ enum { /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + /* extent cache type */ enum extent_type { EX_READ, + EX_BLOCK_AGE, NR_EXTENT_CACHES, }; @@ -637,6 +651,13 @@ struct extent_info { unsigned int c_len; #endif }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; }; }; @@ -1653,6 +1674,11 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -3857,6 +3883,8 @@ struct f2fs_stat_info { unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -4168,6 +4196,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cbe7c24065c7b..56c23b5e9d658 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -619,6 +619,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c845c16f97d05..ff6cf66ed46b2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ static int do_read_inode(struct inode *inode) /* Need all the flag bits */ f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 07419c3e42a52..dde4c04587049 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) avail_ram = val.totalram - val.totalhigh; /* - * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively */ if (type == FREE_NIDS) { mem_size = (nm_i->nid_cnt[FREE_NID] * @@ -85,14 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == READ_EXTENT_CACHE) { - struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? + EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a0..99454d46a9396 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8722d1a13c170..dee712f7225f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -453,6 +453,11 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); @@ -3151,10 +3156,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3169,6 +3192,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3287,6 +3315,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3414,6 +3445,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 33c490e69ae30..83d6fb97dcae0 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -59,6 +59,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count read extent cache entries */ count += __count_extent_cache(sbi, EX_READ); + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -102,8 +105,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + /* shrink read extent cache entries */ - freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -134,6 +140,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 412c2e7352c04..180d8b804d139 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -163,6 +163,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -241,6 +242,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -1257,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1958,6 +1963,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2219,6 +2226,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2313,6 +2321,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a4745d596310f..2ab2151105966 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -668,6 +668,24 @@ out: return count; } + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -923,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1018,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2bb37892d2ba1..31d994e6b4ca9 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); +TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ @@ -155,6 +156,11 @@ TRACE_DEFINE_ENUM(EX_READ); { COMPRESS_ZSTD, "ZSTD" }, \ { COMPRESS_LZORLE, "LZO-RLE" }) +#define show_extent_type(type) \ + __print_symbolic(type, \ + { EX_READ, "Read" }, \ + { EX_BLOCK_AGE, "Block Age" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1544,7 +1550,7 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), __entry->pgofs, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, @@ -1583,6 +1589,45 @@ TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, __entry->blk) ); +TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, + + TP_PROTO(struct inode *inode, unsigned int pgofs, + struct extent_info *ei), + + TP_ARGS(inode, pgofs, ei), + + TP_CONDITION(ei), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, fofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->fofs = ei->fofs; + __entry->len = ei->len; + __entry->age = ei->age; + __entry->blocks = ei->last_blocks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", + show_dev_ino(__entry), + __entry->pgofs, + __entry->fofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, @@ -1618,6 +1663,41 @@ TRACE_EVENT(f2fs_update_read_extent_tree_range, __entry->c_len) ); +TRACE_EVENT(f2fs_update_age_extent_tree_range, + + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + unsigned long long age, + unsigned long long last_blks), + + TP_ARGS(inode, pgofs, len, age, last_blks), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->len = len; + __entry->age = age; + __entry->blocks = last_blks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "len = %u, age = %llu, blocks = %llu", + show_dev_ino(__entry), + __entry->pgofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, @@ -1643,7 +1723,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, @@ -1670,7 +1750,7 @@ TRACE_EVENT(f2fs_destroy_extent_tree, TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, -- GitLab From db8dcd25ec84120d4e57a7f17a566825cec17ae8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 13:42:17 +0000 Subject: [PATCH 1342/1423] f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache There is a spelling mistake in a label name. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 180d8b804d139..c02a717cf880b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4723,7 +4723,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4740,7 +4740,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); -- GitLab From 15e38ee44d50cad264da80ef75626b9224ddc4a3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 5 Dec 2022 22:56:03 +0800 Subject: [PATCH 1343/1423] f2fs: fix iostat parameter for discard Just like other data we count uses the number of bytes as the basic unit, but discard uses the number of cmds as the statistical unit. In fact the discard command contains the number of blocks, so let's change to the number of bytes as the base unit. Fixes: b0af6d491a6b ("f2fs: add app/fs io stat") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dee712f7225f9..f1845a032885d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1187,7 +1187,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; -- GitLab From 25547439f1dcc3def6062bd3e69165cd806a594e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 2 Dec 2022 12:58:41 +0800 Subject: [PATCH 1344/1423] f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super() No need to call f2fs_issue_discard_timeout() in f2fs_put_super, when no discard command requires issue. Since the caller of f2fs_issue_discard_timeout() usually judges the number of discard commands before using it. Let's move this logic to f2fs_issue_discard_timeout(). By the way, use f2fs_realtime_discard_enable to simplify the code. Reported-by: kernel test robot Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- fs/f2fs/super.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1845a032885d..a9099a754dd2e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -2116,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c02a717cf880b..1f812b9ce985b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1581,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -2233,7 +2232,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2420,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } -- GitLab From 7411143f2021530d7641fbb40daaada4ee63f7e6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Nov 2022 12:15:23 +0800 Subject: [PATCH 1345/1423] f2fs: fix some format WARNING in debug.c and sysfs.c To fix: WARNING: function definition argument 'struct f2fs_attr *' should also have an identifier name + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); WARNING: return sysfs_emit(...) formats should include a terminating newline + return sysfs_emit(buf, "(none)"); WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned npages = NODE_MAPPING(sbi)->nrpages; WARNING: Missing a blank line after declarations + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; WARNING: quoted string split across lines + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 45 +++++++++++++++++++++++---------------------- fs/f2fs/sysfs.c | 10 +++++----- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8f1ef742551fc..32af4f0c57357 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -318,18 +318,19 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -477,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -540,11 +541,11 @@ static int stat_show(struct seq_file *s, void *v) si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2ab2151105966..83a366f3ee80e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sysfs_emit(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS -- GitLab From 26a8057a1ada97b528b93fdf3ac4fd03170f1900 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Sun, 11 Dec 2022 21:08:41 +0800 Subject: [PATCH 1346/1423] f2fs: reset wait_ms to default if any of the victims have been selected In non-foreground gc mode, if no victim is selected, the gc process will wait for no_gc_sleep_time before waking up again. In this subsequent time, even though a victim will be selected, the gc process still waits for no_gc_sleep_time before waking up. The configuration of wait_ms is not reasonable. After any of the victims have been selected, we need to reset wait_ms to default sleep time from no_gc_sleep_time. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f0c6506d89758..d7a9d84ba57c1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,6 +141,10 @@ do_gc: /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) -- GitLab From e923f4625ed3ad7656c3f9f086c898798bafbbc5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Thu, 1 Dec 2022 12:37:50 +0100 Subject: [PATCH 1347/1423] riscv: Apply a static assert to riscv_isa_ext_id Add a static assert to ensure a RISCV_ISA_EXT_* enum is never created with a value >= RISCV_ISA_EXT_MAX. We can do this by putting RISCV_ISA_EXT_ID_MAX to more work. Before it was redundant with RISCV_ISA_EXT_MAX and hence only used to document the limit. Now it grows with the enum and is used to check the limit. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20221201113750.18021-1-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index b225252900730..86328e3acb02e 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -59,8 +59,9 @@ enum riscv_isa_ext_id { RISCV_ISA_EXT_ZIHINTPAUSE, RISCV_ISA_EXT_SSTC, RISCV_ISA_EXT_SVINVAL, - RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX, + RISCV_ISA_EXT_ID_MAX }; +static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX); /* * This enum represents the logical ID for each RISC-V ISA extension static -- GitLab From 71fc3621efc38ace9640ee6a0db3300900689592 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 1 Dec 2022 14:51:28 +0100 Subject: [PATCH 1348/1423] riscv: Fix P4D_SHIFT definition for 3-level page table mode RISC-V kernels support 3,4,5-level page tables at runtime by folding upper levels. In case of a 3-level page table, PGDIR is folded into P4D which in turn is folded into PUD: PGDIR_SHIFT value is correctly set to the same value as PUD_SHIFT, but P4D_SHIFT is not, then any use of P4D_SHIFT will access invalid address bits (all set to 1). Fix this by dynamically defining P4D_SHIFT value, like we already do for PGDIR_SHIFT. Fixes: d10efa21a937 ("riscv: mm: Control p4d's folding by pgtable_l5_enabled") Signed-off-by: Alexandre Ghiti Reviewed-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20221201135128.1482189-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable-64.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index dc42375c23571..42a042c0e13ed 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -25,7 +25,11 @@ extern bool pgtable_l5_enabled; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* p4d is folded into pgd in case of 4-level page table */ -#define P4D_SHIFT 39 +#define P4D_SHIFT_L3 30 +#define P4D_SHIFT_L4 39 +#define P4D_SHIFT_L5 39 +#define P4D_SHIFT (pgtable_l5_enabled ? P4D_SHIFT_L5 : \ + (pgtable_l4_enabled ? P4D_SHIFT_L4 : P4D_SHIFT_L3)) #define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) #define P4D_MASK (~(P4D_SIZE - 1)) -- GitLab From c528ef0888b75f673f7d48022de8d31d5b451e8c Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 7 Dec 2022 04:11:12 -0500 Subject: [PATCH 1349/1423] riscv: Fixup compile error with !MMU Current nommu_virt_defconfig can't compile: In file included from arch/riscv/kernel/crash_core.c:3: arch/riscv/kernel/crash_core.c: In function 'arch_crash_save_vmcoreinfo': arch/riscv/kernel/crash_core.c:8:27: error: 'VA_BITS' undeclared (first use in this function) 8 | VMCOREINFO_NUMBER(VA_BITS); | ^~~~~~~ Add MMU dependency for KEXEC_FILE. Fixes: 6261586e0c91 ("RISC-V: Add kexec_file support") Reported-by: Conor Dooley Reported-by: kernel test robot Signed-off-by: Guo Ren Signed-off-by: Guo Ren Tested-by: Conor Dooley Link: https://lore.kernel.org/r/20221207091112.2258674-1-guoren@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 67ef08d33d3ad..e1a9fa47f0123 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -498,7 +498,7 @@ config KEXEC_FILE select KEXEC_CORE select KEXEC_ELF select HAVE_IMA_KEXEC if IMA - depends on 64BIT + depends on 64BIT && MMU help This is new version of kexec system call. This system call is file based and takes file descriptors as system call argument -- GitLab From 37f0ab1477994a0d0dc3c1e0de030fae07d37965 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 6 Dec 2022 18:08:12 -0800 Subject: [PATCH 1350/1423] Documentation: RISC-V: Fix a typo in patch-acceptance I just stumbled on this when modifying the docs. Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20221207020815.16214-2-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/patch-acceptance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst index dfe0ac5624fb2..5da6f9b273d6c 100644 --- a/Documentation/riscv/patch-acceptance.rst +++ b/Documentation/riscv/patch-acceptance.rst @@ -29,7 +29,7 @@ their own custom extensions. These custom extensions aren't required to go through any review or ratification process by the RISC-V Foundation. To avoid the maintenance complexity and potential performance impact of adding kernel code for implementor-specific -RISC-V extensions, we'll only to accept patches for extensions that +RISC-V extensions, we'll only accept patches for extensions that have been officially frozen or ratified by the RISC-V Foundation. (Implementors, may, of course, maintain their own Linux kernel trees containing code for any custom extensions that they wish.) -- GitLab From 936100d4507f2e9f0be4621b0c698180d65e8264 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 6 Dec 2022 18:08:13 -0800 Subject: [PATCH 1351/1423] Documentation: RISC-V: Allow patches for non-standard behavior The patch acceptance policy forbids accepting support for non-standard behavior. This policy was written in order to both steer implementers towards the standards and to avoid coupling the upstream kernel too tightly to vendor-specific features. Those were good goals, but in practice the policy just isn't working: every RISC-V system we have needs vendor-specific behavior in the kernel and we end up taking that support which violates the policy. That's confusing for contributors, which is the main reason we have a written policy in the first place. So let's just start taking code for vendor-defined behavior. Reviewed-by: Conor Dooley Reviewed-by: Anup Patel Signed-off-by: Paul Walmsley Link: https://lore.kernel.org/all/alpine.DEB.2.21.999.2211181027590.4480@utopia.booyaka.com/ [Palmer: merge in Paul's suggestions] Link: https://lore.kernel.org/r/20221207020815.16214-3-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/patch-acceptance.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst index 5da6f9b273d6c..d9d628505cd84 100644 --- a/Documentation/riscv/patch-acceptance.rst +++ b/Documentation/riscv/patch-acceptance.rst @@ -29,7 +29,11 @@ their own custom extensions. These custom extensions aren't required to go through any review or ratification process by the RISC-V Foundation. To avoid the maintenance complexity and potential performance impact of adding kernel code for implementor-specific -RISC-V extensions, we'll only accept patches for extensions that -have been officially frozen or ratified by the RISC-V Foundation. -(Implementors, may, of course, maintain their own Linux kernel trees -containing code for any custom extensions that they wish.) +RISC-V extensions, we'll only consider patches for extensions that either: + +- Have been officially frozen or ratified by the RISC-V Foundation, or +- Have been implemented in hardware that is widely available, per standard + Linux practice. + +(Implementors, may, of course, maintain their own Linux kernel trees containing +code for any custom extensions that they wish.) -- GitLab From 68eabc72023f2c1cdbf7932fde57a7811c65b414 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 6 Dec 2022 18:08:14 -0800 Subject: [PATCH 1352/1423] Documentation: RISC-V: Mention the UEFI Standards The current patch acceptance policy requires that specifications are approved by the RISC-V foundation, but we rely on external specifications as well. This explicitly calls out the UEFI specifications that we're starting to depend on. Reviewed-by: Conor Dooley Reviewed-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20221207020815.16214-4-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/patch-acceptance.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst index d9d628505cd84..389a455843865 100644 --- a/Documentation/riscv/patch-acceptance.rst +++ b/Documentation/riscv/patch-acceptance.rst @@ -20,9 +20,11 @@ Submit Checklist Addendum ------------------------- We'll only accept patches for new modules or extensions if the specifications for those modules or extensions are listed as being -"Frozen" or "Ratified" by the RISC-V Foundation. (Developers may, of -course, maintain their own Linux kernel trees that contain code for -any draft extensions that they wish.) +unlikely to be incompatibly changed in the future. For +specifications from the RISC-V foundation this means "Frozen" or +"Ratified", for the UEFI forum specifications this means a published +ECR. (Developers may, of course, maintain their own Linux kernel trees +that contain code for any draft extensions that they wish.) Additionally, the RISC-V specification allows implementors to create their own custom extensions. These custom extensions aren't required -- GitLab From a39c636506cb90b9ba25cbb0a78bbcc3725ea227 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 6 Dec 2022 18:08:15 -0800 Subject: [PATCH 1353/1423] Documentation: RISC-V: patch-acceptance: s/implementor/implementer Implementor does appear to be a word, but it's not very common. Suggested-by: Conor Dooley Reviewed-by: Anup Patel Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20221207020815.16214-5-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/patch-acceptance.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst index 389a455843865..07d5a5623e2a3 100644 --- a/Documentation/riscv/patch-acceptance.rst +++ b/Documentation/riscv/patch-acceptance.rst @@ -26,7 +26,7 @@ specifications from the RISC-V foundation this means "Frozen" or ECR. (Developers may, of course, maintain their own Linux kernel trees that contain code for any draft extensions that they wish.) -Additionally, the RISC-V specification allows implementors to create +Additionally, the RISC-V specification allows implementers to create their own custom extensions. These custom extensions aren't required to go through any review or ratification process by the RISC-V Foundation. To avoid the maintenance complexity and potential @@ -37,5 +37,5 @@ RISC-V extensions, we'll only consider patches for extensions that either: - Have been implemented in hardware that is widely available, per standard Linux practice. -(Implementors, may, of course, maintain their own Linux kernel trees containing +(Implementers, may, of course, maintain their own Linux kernel trees containing code for any custom extensions that they wish.) -- GitLab From d74f4a3f6d88a2416564bc6bf937e423a4ae8f8e Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 14 Dec 2022 10:39:11 +0800 Subject: [PATCH 1354/1423] cifs: Remove duplicated include in cifsglob.h ./fs/cifs/cifsglob.h: linux/scatterlist.h is included more than once. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3459 Fixes: f7f291e14dde ("cifs: fix oops during encryption") Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 703685e2db5e7..82f2d3070c266 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -23,7 +23,6 @@ #include "cifs_fs_sb.h" #include "cifsacl.h" #include -#include #include #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" -- GitLab From 11e47bbd700f31bd1ee9f8863381bc9e741c0e97 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 12 Dec 2022 14:07:48 +0100 Subject: [PATCH 1355/1423] gpio: sim: set a limit on the number of GPIOs With the removal of ARCH_NR_GPIOS in commit 7b61212f2a07 ("gpiolib: Get rid of ARCH_NR_GPIOS") the gpiolib core no longer sanitizes the number of GPIOs for us. This causes the gpio-sim selftests to now fail when setting the number of GPIOs to 99999 and expecting the probe() to fail. Set a sane limit of 1024 on the number of simulated GPIOs and bail out of probe if it's exceeded. Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202212112236.756f5db9-oliver.sang@intel.com Fixes: 7b61212f2a07 ("gpiolib: Get rid of ARCH_NR_GPIOS") Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 1020c2feb2496..60514bc5454f8 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -31,6 +31,7 @@ #include "gpiolib.h" +#define GPIO_SIM_NGPIO_MAX 1024 #define GPIO_SIM_PROP_MAX 4 /* Max 3 properties + sentinel. */ #define GPIO_SIM_NUM_ATTRS 3 /* value, pull and sentinel */ @@ -371,6 +372,9 @@ static int gpio_sim_add_bank(struct fwnode_handle *swnode, struct device *dev) if (ret) return ret; + if (num_lines > GPIO_SIM_NGPIO_MAX) + return -ERANGE; + ret = fwnode_property_read_string(swnode, "gpio-sim,label", &label); if (ret) { label = devm_kasprintf(dev, GFP_KERNEL, "%s-%s", -- GitLab From 904f309ae7edaadc9fd0ee04be8281d7781d97e4 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 13 Dec 2022 18:06:51 -0800 Subject: [PATCH 1356/1423] thermal: intel: Don't set HFI status bit to 1 When CPU doesn't support HFI (Hardware Feedback Interface), don't include BIT 26 in the mask to prevent clearing. otherwise this results in: unchecked MSR access error: WRMSR to 0x1b1 (tried to write 0x0000000004000aa8) at rIP: 0xffffffff8b8559fe (throttle_active_work+0xbe/0x1b0) Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing of HFI status") Reported-by: Linus Torvalds Tested-by: Linus Torvalds Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/therm_throt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index 4bb7fddaa1437..2e22bb82b7389 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -194,7 +194,7 @@ static const struct attribute_group thermal_attr_group = { #define THERM_STATUS_PROCHOT_LOG BIT(1) #define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(26)) +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) /* * Clear the bits in package thermal status register for bit = 1 @@ -211,6 +211,9 @@ void thermal_clear_package_intr_status(int level, u64 bit_mask) } else { msr = MSR_IA32_PACKAGE_THERM_STATUS; msr_val = THERM_STATUS_CLEAR_PKG_MASK; + if (boot_cpu_has(X86_FEATURE_HFI)) + msr_val |= BIT(26); + } msr_val &= ~bit_mask; -- GitLab From f0f4c3adcfe6d93508a7fbe241b115aa5b24fe36 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sun, 16 Oct 2022 11:00:34 +0200 Subject: [PATCH 1357/1423] dt-bindings: thermal: tsens: Add sm8450 compatible Document the tsens-v2 compatible for sm8450 SoC. Signed-off-by: Luca Weiss Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221016090035.565350-5-luca@z3ntu.xyz Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 038d81338fcf1..7db905c1d5558 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -57,6 +57,7 @@ properties: - qcom,sm8150-tsens - qcom,sm8250-tsens - qcom,sm8350-tsens + - qcom,sm8450-tsens - const: qcom,tsens-v2 reg: -- GitLab From 1f455f144fb05e53ae47f84eb109a8021fe168d1 Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Fri, 14 Oct 2022 10:16:20 +0200 Subject: [PATCH 1358/1423] thermal/drivers/imx8mm_thermal: Use GENMASK() when appropriate GENMASK() is preferred to use for bitmasks. Signed-off-by: Marcus Folkesson Link: https://lore.kernel.org/r/20221014081620.1599511-1-marcus.folkesson@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/imx8mm_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c index e2c2673025a7a..68608d8e5e29f 100644 --- a/drivers/thermal/imx8mm_thermal.c +++ b/drivers/thermal/imx8mm_thermal.c @@ -23,8 +23,8 @@ #define TER_ADC_PD BIT(30) #define TER_EN BIT(31) -#define TRITSR_TEMP0_VAL_MASK 0xff -#define TRITSR_TEMP1_VAL_MASK 0xff0000 +#define TRITSR_TEMP0_VAL_MASK GENMASK(7, 0) +#define TRITSR_TEMP1_VAL_MASK GENMASK(23, 16) #define PROBE_SEL_ALL GENMASK(31, 30) -- GitLab From d37edc7370273306d8747097fafa62436c1cfe16 Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Fri, 14 Oct 2022 09:35:07 +0200 Subject: [PATCH 1359/1423] thermal/drivers/imx8mm_thermal: Validate temperature range Check against the upper temperature limit (125 degrees C) before consider the temperature valid. Fixes: 5eed800a6811 ("thermal: imx8mm: Add support for i.MX8MM thermal monitoring unit") Signed-off-by: Marcus Folkesson Reviewed-by: Jacky Bai Link: https://lore.kernel.org/r/20221014073507.1594844-1-marcus.folkesson@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/imx8mm_thermal.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c index 68608d8e5e29f..182cc207afe90 100644 --- a/drivers/thermal/imx8mm_thermal.c +++ b/drivers/thermal/imx8mm_thermal.c @@ -65,8 +65,14 @@ static int imx8mm_tmu_get_temp(void *data, int *temp) u32 val; val = readl_relaxed(tmu->base + TRITSR) & TRITSR_TEMP0_VAL_MASK; + + /* + * Do not validate against the V bit (bit 31) due to errata + * ERR051272: TMU: Bit 31 of registers TMU_TSCR/TMU_TRITSR/TMU_TRATSR invalid + */ + *temp = val * 1000; - if (*temp < VER1_TEMP_LOW_LIMIT) + if (*temp < VER1_TEMP_LOW_LIMIT || *temp > VER2_TEMP_HIGH_LIMIT) return -EAGAIN; return 0; -- GitLab From 87f9fe8c4b603bd18f025d8c5fc9bb5f17b9a4cc Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2022 12:52:35 -0500 Subject: [PATCH 1360/1423] dt-bindings: thermal: Convert generic-adc-thermal to DT schema Convert the 'generic-adc-thermal' binding to DT schema format. The binding said '#thermal-sensor-cells' should be 1, but all in tree users are 0 and 1 doesn't make sense for a single channel. Drop the example's related providers and consumers of the 'generic-adc-thermal' node as the convention is to not have those in the examples. Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221011175235.3191509-1-robh@kernel.org Signed-off-by: Daniel Lezcano --- .../bindings/thermal/generic-adc-thermal.yaml | 84 ++++++++++++++++ .../bindings/thermal/thermal-generic-adc.txt | 95 ------------------- 2 files changed, 84 insertions(+), 95 deletions(-) create mode 100644 Documentation/devicetree/bindings/thermal/generic-adc-thermal.yaml delete mode 100644 Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt diff --git a/Documentation/devicetree/bindings/thermal/generic-adc-thermal.yaml b/Documentation/devicetree/bindings/thermal/generic-adc-thermal.yaml new file mode 100644 index 0000000000000..f1fc3b0d86085 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/generic-adc-thermal.yaml @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/generic-adc-thermal.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: General Purpose Analog To Digital Converter (ADC) based thermal sensor + +maintainers: + - Laxman Dewangan + +description: + On some of platforms, thermal sensor like thermistors are connected to + one of ADC channel and sensor resistance is read via voltage across the + sensor resistor. The voltage read across the sensor is mapped to + temperature using voltage-temperature lookup table. + +properties: + compatible: + const: generic-adc-thermal + + '#thermal-sensor-cells': + const: 0 + + io-channels: + maxItems: 1 + + io-channel-names: + const: sensor-channel + + temperature-lookup-table: + description: | + Lookup table to map the relation between ADC value and temperature. + When ADC is read, the value is looked up on the table to get the + equivalent temperature. + + If not specified, driver assumes the ADC channel gives milliCelsius + directly. + $ref: /schemas/types.yaml#/definitions/int32-matrix + items: + items: + - description: Temperature in milliCelsius + - description: ADC read value + +required: + - compatible + - '#thermal-sensor-cells' + - io-channels + - io-channel-names + +additionalProperties: false + +examples: + - | + #include + + thermal-sensor { + compatible = "generic-adc-thermal"; + #thermal-sensor-cells = <0>; + io-channels = <&ads1015 1>; + io-channel-names = "sensor-channel"; + temperature-lookup-table = < + (-40000) 2578 + (-39000) 2577 + (-38000) 2576 + (-37000) 2575 + (-36000) 2574 + (-35000) 2573 + (-34000) 2572 + (-33000) 2571 + (-32000) 2569 + (-31000) 2568 + (-30000) 2567 + /* skip */ + 118000 254 + 119000 247 + 120000 240 + 121000 233 + 122000 226 + 123000 220 + 124000 214 + 125000 208>; + }; +... diff --git a/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt b/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt deleted file mode 100644 index e136946a2f4fd..0000000000000 --- a/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt +++ /dev/null @@ -1,95 +0,0 @@ -General Purpose Analog To Digital Converter (ADC) based thermal sensor. - -On some of platforms, thermal sensor like thermistors are connected to -one of ADC channel and sensor resistance is read via voltage across the -sensor resistor. The voltage read across the sensor is mapped to -temperature using voltage-temperature lookup table. - -Required properties: -=================== -- compatible: Must be "generic-adc-thermal". -- #thermal-sensor-cells: Should be 1. See Documentation/devicetree/bindings/thermal/thermal-sensor.yaml for a description - of this property. -Optional properties: -=================== -- temperature-lookup-table: Two dimensional array of Integer; lookup table - to map the relation between ADC value and - temperature. When ADC is read, the value is - looked up on the table to get the equivalent - temperature. - - The first value of the each row of array is the - temperature in milliCelsius and second value of - the each row of array is the ADC read value. - - If not specified, driver assumes the ADC channel - gives milliCelsius directly. - -Example : -#include - -i2c@7000c400 { - ads1015: ads1015@4a { - reg = <0x4a>; - compatible = "ads1015"; - sampling-frequency = <3300>; - #io-channel-cells = <1>; - }; -}; - -tboard_thermistor: thermal-sensor { - compatible = "generic-adc-thermal"; - #thermal-sensor-cells = <0>; - io-channels = <&ads1015 1>; - io-channel-names = "sensor-channel"; - temperature-lookup-table = < (-40000) 2578 - (-39000) 2577 - (-38000) 2576 - (-37000) 2575 - (-36000) 2574 - (-35000) 2573 - (-34000) 2572 - (-33000) 2571 - (-32000) 2569 - (-31000) 2568 - (-30000) 2567 - :::::::::: - 118000 254 - 119000 247 - 120000 240 - 121000 233 - 122000 226 - 123000 220 - 124000 214 - 125000 208>; -}; - -dummy_cool_dev: dummy-cool-dev { - compatible = "dummy-cooling-dev"; - #cooling-cells = <2>; /* min followed by max */ -}; - -thermal-zones { - Tboard { - polling-delay = <15000>; /* milliseconds */ - polling-delay-passive = <0>; /* milliseconds */ - thermal-sensors = <&tboard_thermistor>; - - trips { - therm_est_trip: therm_est_trip { - temperature = <40000>; - type = "active"; - hysteresis = <1000>; - }; - }; - - cooling-maps { - map0 { - trip = <&therm_est_trip>; - cooling-device = <&dummy_cool_dev THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; - contribution = <100>; - }; - - }; - }; -}; -- GitLab From a7c42af78b19a11e98a5555a664c343e3a672632 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Mon, 10 Oct 2022 09:11:26 +0530 Subject: [PATCH 1361/1423] thermal/drivers/k3_j72xx_bandgap: Fix the debug print message The debug print message to check the workaround applicability is inverted. Fix the same. Fixes: ffcb2fc86eb7 ("thermal: k3_j72xx_bandgap: Add the bandgap driver support") Reported-by: Bryan Brattlof Signed-off-by: Keerthy Link: https://lore.kernel.org/r/20221010034126.3550-1-j-keerthy@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index 16b6bcf1bf4fa..c073b1023bbe7 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -439,7 +439,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) workaround_needed = false; dev_dbg(bgp->dev, "Work around %sneeded\n", - workaround_needed ? "not " : ""); + workaround_needed ? "" : "not "); if (!workaround_needed) init_table(5, ref_table, golden_factors); -- GitLab From 7ef2f023c2c77a452ba5d0c9b1ad04a5a895d553 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 20 Oct 2022 13:36:58 +0300 Subject: [PATCH 1362/1423] thermal/of: Fix memory leak on thermal_of_zone_register() failure The function does not free 'of_ops' upon failure, leading to a memory leak [1]. Fix by freeing 'of_ops' in the error path. [1] unreferenced object 0xffff8ee846198c80 (size 128): comm "swapper/0", pid 1, jiffies 4294699704 (age 70.076s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ d0 3f 6e 8c ff ff ff ff 00 00 00 00 00 00 00 00 .?n............. backtrace: [<00000000d136f562>] __kmalloc_node_track_caller+0x42/0x120 [<0000000063f31678>] kmemdup+0x1d/0x40 [<00000000e6d24096>] thermal_of_zone_register+0x49/0x520 [<000000005e78c755>] devm_thermal_of_zone_register+0x54/0x90 [<00000000ee6b209e>] pmbus_add_sensor+0x1b4/0x1d0 [<00000000896105e3>] pmbus_add_sensor_attrs_one+0x123/0x440 [<0000000049e990a6>] pmbus_add_sensor_attrs+0xfe/0x1d0 [<00000000466b5440>] pmbus_do_probe+0x66b/0x14e0 [<0000000084d42285>] i2c_device_probe+0x13b/0x2f0 [<0000000029e2ae74>] really_probe+0xce/0x2c0 [<00000000692df15c>] driver_probe_device+0x19/0xd0 [<00000000547d9cce>] __device_attach_driver+0x6f/0x100 [<0000000020abd24b>] bus_for_each_drv+0x76/0xc0 [<00000000665d9563>] __device_attach+0xfc/0x180 [<000000008ddd4d6a>] bus_probe_device+0x82/0xa0 [<000000009e61132b>] device_add+0x3fe/0x920 Fixes: 3fd6d6e2b4e8 ("thermal/of: Rework the thermal device tree initialization") Signed-off-by: Ido Schimmel Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20221020103658.802457-1-idosch@nvidia.com Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_of.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index d4b6335ace15f..aacba30bc10c1 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -604,13 +604,15 @@ struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, if (IS_ERR(np)) { if (PTR_ERR(np) != -ENODEV) pr_err("Failed to find thermal zone for %pOFn id=%d\n", sensor, id); - return ERR_CAST(np); + ret = PTR_ERR(np); + goto out_kfree_of_ops; } trips = thermal_of_trips_init(np, &ntrips); if (IS_ERR(trips)) { pr_err("Failed to find trip points for %pOFn id=%d\n", sensor, id); - return ERR_CAST(trips); + ret = PTR_ERR(trips); + goto out_kfree_of_ops; } ret = thermal_of_monitor_init(np, &delay, &pdelay); @@ -659,6 +661,8 @@ out_kfree_tzp: kfree(tzp); out_kfree_trips: kfree(trips); +out_kfree_of_ops: + kfree(of_ops); return ERR_PTR(ret); } -- GitLab From c6db32ec7c600ae7ab92eb6c56a9fb2918fcd780 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 19 Aug 2022 00:02:41 +0200 Subject: [PATCH 1363/1423] dt-bindings: thermal: tsens: Add ipq8074 compatible Qualcomm IPQ8074 has tsens v2.3.0 block, though unlike existing v2 IP it only uses one IRQ, so tsens v2 compatible cannot be used as the fallback. We also have to make sure that correct interrupts are set according to compatibles, so populate interrupt information per compatibles. Signed-off-by: Robert Marko Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220818220245.338396-1-robimarko@gmail.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/qcom-tsens.yaml | 76 ++++++++++++++++--- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 7db905c1d5558..f0bd4b979e28a 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -60,6 +60,10 @@ properties: - qcom,sm8450-tsens - const: qcom,tsens-v2 + - description: v2 of TSENS with combined interrupt + enum: + - qcom,ipq8074-tsens + reg: items: - description: TM registers @@ -67,15 +71,11 @@ properties: interrupts: minItems: 1 - items: - - description: Combined interrupt if upper or lower threshold crossed - - description: Interrupt if critical threshold crossed + maxItems: 2 interrupt-names: minItems: 1 - items: - - const: uplow - - const: critical + maxItems: 2 nvmem-cells: minItems: 1 @@ -129,22 +129,61 @@ allOf: then: properties: interrupts: - maxItems: 1 + items: + - description: Combined interrupt if upper or lower threshold crossed interrupt-names: - maxItems: 1 + items: + - const: uplow - else: + - if: + properties: + compatible: + contains: + enum: + - qcom,msm8953-tsens + - qcom,msm8996-tsens + - qcom,msm8998-tsens + - qcom,sc7180-tsens + - qcom,sc7280-tsens + - qcom,sc8180x-tsens + - qcom,sdm630-tsens + - qcom,sdm845-tsens + - qcom,sm8150-tsens + - qcom,sm8250-tsens + - qcom,sm8350-tsens + - qcom,tsens-v2 + then: + properties: + interrupts: + items: + - description: Combined interrupt if upper or lower threshold crossed + - description: Interrupt if critical threshold crossed + interrupt-names: + items: + - const: uplow + - const: critical + + - if: + properties: + compatible: + contains: + enum: + - qcom,ipq8074-tsens + then: properties: interrupts: - minItems: 2 + items: + - description: Combined interrupt if upper, lower or critical thresholds crossed interrupt-names: - minItems: 2 + items: + - const: combined - if: properties: compatible: contains: enum: + - qcom,ipq8074-tsens - qcom,tsens-v0_1 - qcom,tsens-v1 - qcom,tsens-v2 @@ -227,4 +266,19 @@ examples: #qcom,sensors = <13>; #thermal-sensor-cells = <1>; }; + + - | + #include + // Example 4 (for any IPQ8074 based SoC-s): + tsens4: thermal-sensor@4a9000 { + compatible = "qcom,ipq8074-tsens"; + reg = <0x4a9000 0x1000>, + <0x4a8000 0x1000>; + + interrupts = ; + interrupt-names = "combined"; + + #qcom,sensors = <16>; + #thermal-sensor-cells = <1>; + }; ... -- GitLab From 4360af35273b742ffc6605ce44f37ac654272443 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 19 Aug 2022 00:02:42 +0200 Subject: [PATCH 1364/1423] thermal/drivers/tsens: Add support for combined interrupt Despite using tsens v2.3 IP, IPQ8074 and IPQ6018 only have one IRQ for signaling both up/low and critical trips. Signed-off-by: Robert Marko Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220818220245.338396-2-robimarko@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens-8960.c | 1 + drivers/thermal/qcom/tsens-v0_1.c | 1 + drivers/thermal/qcom/tsens-v1.c | 1 + drivers/thermal/qcom/tsens-v2.c | 1 + drivers/thermal/qcom/tsens.c | 38 ++++++++++++++++++++++++++----- drivers/thermal/qcom/tsens.h | 2 ++ 6 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 67c1748cdf734..ee584e5b07e56 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -269,6 +269,7 @@ static const struct tsens_ops ops_8960 = { static struct tsens_features tsens_8960_feat = { .ver_major = VER_0, .crit_int = 0, + .combo_int = 0, .adc = 1, .srot_split = 0, .max_sensors = 11, diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c index 327f37202c69f..8cdc3d135d6a8 100644 --- a/drivers/thermal/qcom/tsens-v0_1.c +++ b/drivers/thermal/qcom/tsens-v0_1.c @@ -539,6 +539,7 @@ static int calibrate_9607(struct tsens_priv *priv) static struct tsens_features tsens_v0_1_feat = { .ver_major = VER_0_1, .crit_int = 0, + .combo_int = 0, .adc = 1, .srot_split = 1, .max_sensors = 11, diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index 573e261ccca74..a4f561a6e5821 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -302,6 +302,7 @@ static int calibrate_8976(struct tsens_priv *priv) static struct tsens_features tsens_v1_feat = { .ver_major = VER_1_X, .crit_int = 0, + .combo_int = 0, .adc = 1, .srot_split = 1, .max_sensors = 11, diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c index b293ed32174b5..129cdb247381c 100644 --- a/drivers/thermal/qcom/tsens-v2.c +++ b/drivers/thermal/qcom/tsens-v2.c @@ -31,6 +31,7 @@ static struct tsens_features tsens_v2_feat = { .ver_major = VER_2_X, .crit_int = 1, + .combo_int = 0, .adc = 0, .srot_split = 1, .max_sensors = 16, diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index b1b10005fb286..e2511e76f2969 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -532,6 +532,27 @@ static irqreturn_t tsens_irq_thread(int irq, void *data) return IRQ_HANDLED; } +/** + * tsens_combined_irq_thread() - Threaded interrupt handler for combined interrupts + * @irq: irq number + * @data: tsens controller private data + * + * Handle the combined interrupt as if it were 2 separate interrupts, so call the + * critical handler first and then the up/low one. + * + * Return: IRQ_HANDLED + */ +static irqreturn_t tsens_combined_irq_thread(int irq, void *data) +{ + irqreturn_t ret; + + ret = tsens_critical_irq_thread(irq, data); + if (ret != IRQ_HANDLED) + return ret; + + return tsens_irq_thread(irq, data); +} + static int tsens_set_trips(struct thermal_zone_device *tz, int low, int high) { struct tsens_sensor *s = tz->devdata; @@ -1071,13 +1092,18 @@ static int tsens_register(struct tsens_priv *priv) tsens_mC_to_hw(priv->sensor, 0)); } - ret = tsens_register_irq(priv, "uplow", tsens_irq_thread); - if (ret < 0) - return ret; + if (priv->feat->combo_int) { + ret = tsens_register_irq(priv, "combined", + tsens_combined_irq_thread); + } else { + ret = tsens_register_irq(priv, "uplow", tsens_irq_thread); + if (ret < 0) + return ret; - if (priv->feat->crit_int) - ret = tsens_register_irq(priv, "critical", - tsens_critical_irq_thread); + if (priv->feat->crit_int) + ret = tsens_register_irq(priv, "critical", + tsens_critical_irq_thread); + } return ret; } diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index ba05c82333565..1678c5e9e60bd 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -493,6 +493,7 @@ enum regfield_ids { * struct tsens_features - Features supported by the IP * @ver_major: Major number of IP version * @crit_int: does the IP support critical interrupts? + * @combo_int: does the IP use one IRQ for up, low and critical thresholds? * @adc: do the sensors only output adc code (instead of temperature)? * @srot_split: does the IP neatly splits the register space into SROT and TM, * with SROT only being available to secure boot firmware? @@ -502,6 +503,7 @@ enum regfield_ids { struct tsens_features { unsigned int ver_major; unsigned int crit_int:1; + unsigned int combo_int:1; unsigned int adc:1; unsigned int srot_split:1; unsigned int has_watchdog:1; -- GitLab From f63baced3839f591db76c227e09d47001373d6db Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 19 Aug 2022 00:02:43 +0200 Subject: [PATCH 1365/1423] thermal/drivers/tsens: Allow configuring min and max trips IPQ8074 and IPQ6018 dont support negative trip temperatures and support up to 204 degrees C as the max trip temperature. So, instead of always setting the -40 as min and 120 degrees C as max allow it to be configured as part of the features. Signed-off-by: Robert Marko Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220818220245.338396-3-robimarko@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens-8960.c | 2 ++ drivers/thermal/qcom/tsens-v0_1.c | 2 ++ drivers/thermal/qcom/tsens-v1.c | 2 ++ drivers/thermal/qcom/tsens-v2.c | 2 ++ drivers/thermal/qcom/tsens.c | 4 ++-- drivers/thermal/qcom/tsens.h | 4 ++++ 6 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index ee584e5b07e56..4585904fb3800 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -273,6 +273,8 @@ static struct tsens_features tsens_8960_feat = { .adc = 1, .srot_split = 0, .max_sensors = 11, + .trip_min_temp = -40000, + .trip_max_temp = 120000, }; struct tsens_plat_data data_8960 = { diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c index 8cdc3d135d6a8..04d012e4f7288 100644 --- a/drivers/thermal/qcom/tsens-v0_1.c +++ b/drivers/thermal/qcom/tsens-v0_1.c @@ -543,6 +543,8 @@ static struct tsens_features tsens_v0_1_feat = { .adc = 1, .srot_split = 1, .max_sensors = 11, + .trip_min_temp = -40000, + .trip_max_temp = 120000, }; static const struct reg_field tsens_v0_1_regfields[MAX_REGFIELDS] = { diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index a4f561a6e5821..1d7f8a80bd13a 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -306,6 +306,8 @@ static struct tsens_features tsens_v1_feat = { .adc = 1, .srot_split = 1, .max_sensors = 11, + .trip_min_temp = -40000, + .trip_max_temp = 120000, }; static const struct reg_field tsens_v1_regfields[MAX_REGFIELDS] = { diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c index 129cdb247381c..9babc69bfd22e 100644 --- a/drivers/thermal/qcom/tsens-v2.c +++ b/drivers/thermal/qcom/tsens-v2.c @@ -35,6 +35,8 @@ static struct tsens_features tsens_v2_feat = { .adc = 0, .srot_split = 1, .max_sensors = 16, + .trip_min_temp = -40000, + .trip_max_temp = 120000, }; static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = { diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index e2511e76f2969..f31510489a9ab 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -573,8 +573,8 @@ static int tsens_set_trips(struct thermal_zone_device *tz, int low, int high) dev_dbg(dev, "[%u] %s: proposed thresholds: (%d:%d)\n", hw_id, __func__, low, high); - cl_high = clamp_val(high, -40000, 120000); - cl_low = clamp_val(low, -40000, 120000); + cl_high = clamp_val(high, priv->feat->trip_min_temp, priv->feat->trip_max_temp); + cl_low = clamp_val(low, priv->feat->trip_min_temp, priv->feat->trip_max_temp); high_val = tsens_mC_to_hw(s, cl_high); low_val = tsens_mC_to_hw(s, cl_low); diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index 1678c5e9e60bd..8dc21ca0f2a3f 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -499,6 +499,8 @@ enum regfield_ids { * with SROT only being available to secure boot firmware? * @has_watchdog: does this IP support watchdog functionality? * @max_sensors: maximum sensors supported by this version of the IP + * @trip_min_temp: minimum trip temperature supported by this version of the IP + * @trip_max_temp: maximum trip temperature supported by this version of the IP */ struct tsens_features { unsigned int ver_major; @@ -508,6 +510,8 @@ struct tsens_features { unsigned int srot_split:1; unsigned int has_watchdog:1; unsigned int max_sensors; + int trip_min_temp; + int trip_max_temp; }; /** -- GitLab From 6840455debd38ea82b1f9fbbf8019944da36f7eb Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 19 Aug 2022 00:02:44 +0200 Subject: [PATCH 1366/1423] thermal/drivers/tsens: Add IPQ8074 support Qualcomm IPQ8074 uses tsens v2.3 IP, however unlike other tsens v2 IP it only has one IRQ, that is used for up/low as well as critical. It also does not support negative trip temperatures. Signed-off-by: Robert Marko Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220818220245.338396-4-robimarko@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens-v2.c | 17 +++++++++++++++++ drivers/thermal/qcom/tsens.c | 3 +++ drivers/thermal/qcom/tsens.h | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c index 9babc69bfd22e..29a61d2d6ca31 100644 --- a/drivers/thermal/qcom/tsens-v2.c +++ b/drivers/thermal/qcom/tsens-v2.c @@ -39,6 +39,17 @@ static struct tsens_features tsens_v2_feat = { .trip_max_temp = 120000, }; +static struct tsens_features ipq8074_feat = { + .ver_major = VER_2_X, + .crit_int = 1, + .combo_int = 1, + .adc = 0, + .srot_split = 1, + .max_sensors = 16, + .trip_min_temp = 0, + .trip_max_temp = 204000, +}; + static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = { /* ----- SROT ------ */ /* VERSION */ @@ -104,6 +115,12 @@ struct tsens_plat_data data_tsens_v2 = { .fields = tsens_v2_regfields, }; +struct tsens_plat_data data_ipq8074 = { + .ops = &ops_generic_v2, + .feat = &ipq8074_feat, + .fields = tsens_v2_regfields, +}; + /* Kept around for backward compatibility with old msm8996.dtsi */ struct tsens_plat_data data_8996 = { .num_sensors = 13, diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index f31510489a9ab..b7d978295b2f9 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -980,6 +980,9 @@ static const struct of_device_id tsens_table[] = { { .compatible = "qcom,ipq8064-tsens", .data = &data_8960, + }, { + .compatible = "qcom,ipq8074-tsens", + .data = &data_ipq8074, }, { .compatible = "qcom,mdm9607-tsens", .data = &data_9607, diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index 8dc21ca0f2a3f..899af128855f7 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -597,6 +597,6 @@ extern struct tsens_plat_data data_8916, data_8939, data_8974, data_9607; extern struct tsens_plat_data data_tsens_v1, data_8976; /* TSENS v2 targets */ -extern struct tsens_plat_data data_8996, data_tsens_v2; +extern struct tsens_plat_data data_8996, data_ipq8074, data_tsens_v2; #endif /* __QCOM_TSENS_H__ */ -- GitLab From de48d8766afcd97d147699aaff78a338081c9973 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 22 Oct 2022 14:56:55 +0200 Subject: [PATCH 1367/1423] thermal/drivers/qcom/tsens: Init debugfs only with successful probe Calibrate and tsens_register can fail or PROBE_DEFER. This will cause a double or a wrong init of the debugfs information. Init debugfs only with successful probe fixing warning about directory already present. Signed-off-by: Christian Marangi Acked-by: Thara Gopinath Link: https://lore.kernel.org/r/20221022125657.22530-2-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index b7d978295b2f9..3ac780404c72c 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -939,8 +939,6 @@ int __init init_common(struct tsens_priv *priv) if (tsens_version(priv) >= VER_0_1) tsens_enable_irq(priv); - tsens_debug_init(op); - err_put_device: put_device(&op->dev); return ret; @@ -1182,7 +1180,11 @@ static int tsens_probe(struct platform_device *pdev) } } - return tsens_register(priv); + ret = tsens_register(priv); + if (!ret) + tsens_debug_init(pdev); + + return ret; } static int tsens_remove(struct platform_device *pdev) -- GitLab From c7e077e921fa94e0c06c8d14af6c0504c8a5f4bd Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 22 Oct 2022 14:56:56 +0200 Subject: [PATCH 1368/1423] thermal/drivers/qcom/tsens: Fix wrong version id dbg_version_show For VER_0 the version was incorrectly reported as 0.1.0. Fix that and correctly report the major version for this old tsens revision. Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20221022125657.22530-3-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 3ac780404c72c..5f8d8f04351eb 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -713,7 +713,7 @@ static int dbg_version_show(struct seq_file *s, void *data) return ret; seq_printf(s, "%d.%d.%d\n", maj_ver, min_ver, step_ver); } else { - seq_puts(s, "0.1.0\n"); + seq_printf(s, "0.%d.0\n", priv->feat->ver_major); } return 0; -- GitLab From 89992d95ed1046338c7866ef7bbe6de543a2af91 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 22 Oct 2022 14:56:57 +0200 Subject: [PATCH 1369/1423] thermal/drivers/qcom/tsens: Rework debugfs file structure The current tsens debugfs structure is composed by: - a tsens dir in debugfs with a version file - a directory for each tsens istance with sensors file to dump all the sensors value. This works on the assumption that we have the same version for each istance but this assumption seems fragile and with more than one tsens istance results in the version file not tracking each of them. A better approach is to just create a subdirectory for each tsens istance and put there version and sensors debugfs file. Using this new implementation results in less code since debugfs entry are created only on successful tsens probe. Signed-off-by: Christian Marangi Link: https://lore.kernel.org/r/20221022125657.22530-4-ansuelsmth@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 5f8d8f04351eb..b5b136ff323f9 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -725,21 +725,14 @@ DEFINE_SHOW_ATTRIBUTE(dbg_sensors); static void tsens_debug_init(struct platform_device *pdev) { struct tsens_priv *priv = platform_get_drvdata(pdev); - struct dentry *root, *file; - root = debugfs_lookup("tsens", NULL); - if (!root) + priv->debug_root = debugfs_lookup("tsens", NULL); + if (!priv->debug_root) priv->debug_root = debugfs_create_dir("tsens", NULL); - else - priv->debug_root = root; - - file = debugfs_lookup("version", priv->debug_root); - if (!file) - debugfs_create_file("version", 0444, priv->debug_root, - pdev, &dbg_version_fops); /* A directory for each instance of the TSENS IP */ priv->debug = debugfs_create_dir(dev_name(&pdev->dev), priv->debug_root); + debugfs_create_file("version", 0444, priv->debug, pdev, &dbg_version_fops); debugfs_create_file("sensors", 0444, priv->debug, pdev, &dbg_sensors_fops); } #else -- GitLab From 8848c0d7a0782c263a7827697d5acfcc09a19a5f Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 2 Dec 2022 17:23:49 +0100 Subject: [PATCH 1370/1423] dt-bindings: thermal: imx8mm-thermal: Document optional nvmem-cells The TMU TASR, TCALIVn, TRIM registers must be explicitly programmed with calibration values from OCOTP. Document optional phandle to OCOTP nvmem provider. Acked-by: Krzysztof Kozlowski Signed-off-by: Marek Vasut Signed-off-by: Daniel Lezcano --- .../devicetree/bindings/thermal/imx8mm-thermal.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/imx8mm-thermal.yaml b/Documentation/devicetree/bindings/thermal/imx8mm-thermal.yaml index 89c54e08ee61b..b90726229ac9c 100644 --- a/Documentation/devicetree/bindings/thermal/imx8mm-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/imx8mm-thermal.yaml @@ -32,6 +32,13 @@ properties: clocks: maxItems: 1 + nvmem-cells: + maxItems: 1 + description: Phandle to the calibration data provided by ocotp + + nvmem-cell-names: + const: calib + "#thermal-sensor-cells": description: | Number of cells required to uniquely identify the thermal -- GitLab From 403291648823f819f7baaccd47589fcf87187c32 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 2 Dec 2022 17:23:53 +0100 Subject: [PATCH 1371/1423] thermal/drivers/imx: Add support for loading calibration data from OCOTP The TMU TASR, TCALIVn, TRIM registers must be explicitly programmed with calibration values in OCOTP. Add support for reading the OCOTP calibration data and programming those into the TMU hardware. The MX8MM/MX8MN TMUv1 uses only one OCOTP cell, while MX8MP TMUv2 uses 4, the programming differs in each case. Based on U-Boot commits: 70487ff386c ("imx8mm: Load fuse for TMU TCALIV and TASR") ebb9aab318b ("imx: load calibration parameters from fuse for i.MX8MP") Reviewed-by: Peng Fan Signed-off-by: Marek Vasut Signed-off-by: Daniel Lezcano --- drivers/thermal/imx8mm_thermal.c | 164 +++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c index 182cc207afe90..e709c03bc4b9f 100644 --- a/drivers/thermal/imx8mm_thermal.c +++ b/drivers/thermal/imx8mm_thermal.c @@ -10,9 +10,11 @@ #include #include #include +#include #include #include #include +#include #include #include "thermal_core.h" @@ -20,6 +22,22 @@ #define TER 0x0 /* TMU enable */ #define TPS 0x4 #define TRITSR 0x20 /* TMU immediate temp */ +/* TMU calibration data registers */ +#define TASR 0x28 +#define TASR_BUF_SLOPE_MASK GENMASK(19, 16) +#define TASR_BUF_VREF_MASK GENMASK(4, 0) /* TMU_V1 */ +#define TASR_BUF_VERF_SEL_MASK GENMASK(1, 0) /* TMU_V2 */ +#define TCALIV(n) (0x30 + ((n) * 4)) +#define TCALIV_EN BIT(31) +#define TCALIV_HR_MASK GENMASK(23, 16) /* TMU_V1 */ +#define TCALIV_RT_MASK GENMASK(7, 0) /* TMU_V1 */ +#define TCALIV_SNSR105C_MASK GENMASK(27, 16) /* TMU_V2 */ +#define TCALIV_SNSR25C_MASK GENMASK(11, 0) /* TMU_V2 */ +#define TRIM 0x3c +#define TRIM_BJT_CUR_MASK GENMASK(23, 20) +#define TRIM_BGR_MASK GENMASK(31, 28) +#define TRIM_VLSB_MASK GENMASK(15, 12) +#define TRIM_EN_CH BIT(7) #define TER_ADC_PD BIT(30) #define TER_EN BIT(31) @@ -32,6 +50,25 @@ #define SIGN_BIT BIT(7) #define TEMP_VAL_MASK GENMASK(6, 0) +/* TMU OCOTP calibration data bitfields */ +#define ANA0_EN BIT(25) +#define ANA0_BUF_VREF_MASK GENMASK(24, 20) +#define ANA0_BUF_SLOPE_MASK GENMASK(19, 16) +#define ANA0_HR_MASK GENMASK(15, 8) +#define ANA0_RT_MASK GENMASK(7, 0) +#define TRIM2_VLSB_MASK GENMASK(23, 20) +#define TRIM2_BGR_MASK GENMASK(19, 16) +#define TRIM2_BJT_CUR_MASK GENMASK(15, 12) +#define TRIM2_BUF_SLOP_SEL_MASK GENMASK(11, 8) +#define TRIM2_BUF_VERF_SEL_MASK GENMASK(7, 6) +#define TRIM3_TCA25_0_LSB_MASK GENMASK(31, 28) +#define TRIM3_TCA40_0_MASK GENMASK(27, 16) +#define TRIM4_TCA40_1_MASK GENMASK(31, 20) +#define TRIM4_TCA105_0_MASK GENMASK(19, 8) +#define TRIM4_TCA25_0_MSB_MASK GENMASK(7, 0) +#define TRIM5_TCA105_1_MASK GENMASK(23, 12) +#define TRIM5_TCA25_1_MASK GENMASK(11, 0) + #define VER1_TEMP_LOW_LIMIT 10000 #define VER2_TEMP_LOW_LIMIT -40000 #define VER2_TEMP_HIGH_LIMIT 125000 @@ -134,6 +171,129 @@ static void imx8mm_tmu_probe_sel_all(struct imx8mm_tmu *tmu) writel_relaxed(val, tmu->base + TPS); } +static int imx8mm_tmu_probe_set_calib_v1(struct platform_device *pdev, + struct imx8mm_tmu *tmu) +{ + struct device *dev = &pdev->dev; + u32 ana0; + int ret; + + ret = nvmem_cell_read_u32(&pdev->dev, "calib", &ana0); + if (ret) { + dev_warn(dev, "Failed to read OCOTP nvmem cell (%d).\n", ret); + return ret; + } + + writel(FIELD_PREP(TASR_BUF_VREF_MASK, + FIELD_GET(ANA0_BUF_VREF_MASK, ana0)) | + FIELD_PREP(TASR_BUF_SLOPE_MASK, + FIELD_GET(ANA0_BUF_SLOPE_MASK, ana0)), + tmu->base + TASR); + + writel(FIELD_PREP(TCALIV_RT_MASK, FIELD_GET(ANA0_RT_MASK, ana0)) | + FIELD_PREP(TCALIV_HR_MASK, FIELD_GET(ANA0_HR_MASK, ana0)) | + ((ana0 & ANA0_EN) ? TCALIV_EN : 0), + tmu->base + TCALIV(0)); + + return 0; +} + +static int imx8mm_tmu_probe_set_calib_v2(struct platform_device *pdev, + struct imx8mm_tmu *tmu) +{ + struct device *dev = &pdev->dev; + struct nvmem_cell *cell; + u32 trim[4] = { 0 }; + size_t len; + void *buf; + + cell = nvmem_cell_get(dev, "calib"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + buf = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(buf)) + return PTR_ERR(buf); + + memcpy(trim, buf, min(len, sizeof(trim))); + kfree(buf); + + if (len != 16) { + dev_err(dev, + "OCOTP nvmem cell length is %zu, must be 16.\n", len); + return -EINVAL; + } + + /* Blank sample hardware */ + if (!trim[0] && !trim[1] && !trim[2] && !trim[3]) { + /* Use a default 25C binary codes */ + writel(FIELD_PREP(TCALIV_SNSR25C_MASK, 0x63c), + tmu->base + TCALIV(0)); + writel(FIELD_PREP(TCALIV_SNSR25C_MASK, 0x63c), + tmu->base + TCALIV(1)); + return 0; + } + + writel(FIELD_PREP(TASR_BUF_VERF_SEL_MASK, + FIELD_GET(TRIM2_BUF_VERF_SEL_MASK, trim[0])) | + FIELD_PREP(TASR_BUF_SLOPE_MASK, + FIELD_GET(TRIM2_BUF_SLOP_SEL_MASK, trim[0])), + tmu->base + TASR); + + writel(FIELD_PREP(TRIM_BJT_CUR_MASK, + FIELD_GET(TRIM2_BJT_CUR_MASK, trim[0])) | + FIELD_PREP(TRIM_BGR_MASK, FIELD_GET(TRIM2_BGR_MASK, trim[0])) | + FIELD_PREP(TRIM_VLSB_MASK, FIELD_GET(TRIM2_VLSB_MASK, trim[0])) | + TRIM_EN_CH, + tmu->base + TRIM); + + writel(FIELD_PREP(TCALIV_SNSR25C_MASK, + FIELD_GET(TRIM3_TCA25_0_LSB_MASK, trim[1]) | + (FIELD_GET(TRIM4_TCA25_0_MSB_MASK, trim[2]) << 4)) | + FIELD_PREP(TCALIV_SNSR105C_MASK, + FIELD_GET(TRIM4_TCA105_0_MASK, trim[2])), + tmu->base + TCALIV(0)); + + writel(FIELD_PREP(TCALIV_SNSR25C_MASK, + FIELD_GET(TRIM5_TCA25_1_MASK, trim[3])) | + FIELD_PREP(TCALIV_SNSR105C_MASK, + FIELD_GET(TRIM5_TCA105_1_MASK, trim[3])), + tmu->base + TCALIV(1)); + + writel(FIELD_PREP(TCALIV_SNSR25C_MASK, + FIELD_GET(TRIM3_TCA40_0_MASK, trim[1])) | + FIELD_PREP(TCALIV_SNSR105C_MASK, + FIELD_GET(TRIM4_TCA40_1_MASK, trim[2])), + tmu->base + TCALIV(2)); + + return 0; +} + +static int imx8mm_tmu_probe_set_calib(struct platform_device *pdev, + struct imx8mm_tmu *tmu) +{ + struct device *dev = &pdev->dev; + + /* + * Lack of calibration data OCOTP reference is not considered + * fatal to retain compatibility with old DTs. It is however + * strongly recommended to update such old DTs to get correct + * temperature compensation values for each SoC. + */ + if (!of_find_property(pdev->dev.of_node, "nvmem-cells", NULL)) { + dev_warn(dev, + "No OCOTP nvmem reference found, SoC-specific calibration not loaded. Please update your DT.\n"); + return 0; + } + + if (tmu->socdata->version == TMU_VER1) + return imx8mm_tmu_probe_set_calib_v1(pdev, tmu); + + return imx8mm_tmu_probe_set_calib_v2(pdev, tmu); +} + static int imx8mm_tmu_probe(struct platform_device *pdev) { const struct thermal_soc_data *data; @@ -186,6 +346,10 @@ static int imx8mm_tmu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, tmu); + ret = imx8mm_tmu_probe_set_calib(pdev, tmu); + if (ret) + goto disable_clk; + /* enable all the probes for V2 TMU */ if (tmu->socdata->version == TMU_VER2) imx8mm_tmu_probe_sel_all(tmu); -- GitLab From 3f9cb57962bcfad88bcac750f85ef7425475b102 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 21 Nov 2022 16:47:10 +0100 Subject: [PATCH 1372/1423] thermal: ti-soc-thermal: Drop comma after SoC match table sentinel It does not make sense to have a comma after a sentinel, as any new elements must be added before the sentinel. Signed-off-by: Geert Uytterhoeven Acked-by: Keerthy Link: https://lore.kernel.org/r/1d6de2a80b919cb11199e56ac06ad21c273ebe57.1669045586.git.geert+renesas@glider.be Signed-off-by: Daniel Lezcano --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index 67050a1a5b07f..a1c9a15301832 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -878,7 +878,7 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev) */ static const struct soc_device_attribute soc_no_cpu_notifier[] = { { .machine = "OMAP4430" }, - { /* sentinel */ }, + { /* sentinel */ } }; /*** Device driver call backs ***/ -- GitLab From c464856e63a4c9631add6a27340bd94beb9eb27f Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 30 Nov 2022 13:20:38 +0000 Subject: [PATCH 1373/1423] dt-bindings: thermal: mediatek: add compatible string for MT7986 and MT7981 SoC Document compatible string 'mediatek,mt7986-thermal' for V3 thermal unit found in MT7986 SoCs. 'mediatek,mt7981-thermal' is also added as it is identical with the thermal unit of MT7986. Signed-off-by: Daniel Golle Acked-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/mediatek-thermal.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt index 5c7e7bdd029ab..38b32bb447e39 100644 --- a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt @@ -13,6 +13,8 @@ Required properties: - "mediatek,mt2701-thermal" : For MT2701 family of SoCs - "mediatek,mt2712-thermal" : For MT2712 family of SoCs - "mediatek,mt7622-thermal" : For MT7622 SoC + - "mediatek,mt7981-thermal", "mediatek,mt7986-thermal" : For MT7981 SoC + - "mediatek,mt7986-thermal" : For MT7986 SoC - "mediatek,mt8183-thermal" : For MT8183 family of SoCs - "mediatek,mt8516-thermal", "mediatek,mt2701-thermal : For MT8516 family of SoCs - reg: Address range of the thermal controller -- GitLab From 6f8941646234e2d6e589377aea0dd1ba52365623 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Nov 2022 16:26:30 +0100 Subject: [PATCH 1374/1423] thermal: qcom-spmi-adc-tm5: suppress probe-deferral error message Drivers should not be logging errors on probe deferral. Switch to using dev_err_probe() to log failures when parsing the devicetree to avoid errors like: qcom-spmi-adc-tm5 c440000.spmi:pmic@0:adc-tm@3400: get dt data failed: -517 when a channel is not yet available. Signed-off-by: Johan Hovold Reviewed-by: Manivannan Sadhasivam Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/20221102152630.696-1-johan+linaro@kernel.org Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/qcom-spmi-adc-tm5.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c index 1b2c43eab27d1..88462a4628fb5 100644 --- a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c +++ b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c @@ -1030,10 +1030,8 @@ static int adc_tm5_probe(struct platform_device *pdev) return irq; ret = adc_tm5_get_dt_data(adc_tm, node); - if (ret) { - dev_err(dev, "get dt data failed: %d\n", ret); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "get dt data failed\n"); ret = adc_tm->data->init(adc_tm); if (ret) { -- GitLab From de95d1341a3e682b95ad759bd58b86224819fbec Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Tue, 26 Jul 2022 14:23:31 +0200 Subject: [PATCH 1375/1423] thermal/drivers/imx8mm: Add hwmon support Expose thermal sensors as HWMON devices. Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20220726122331.323093-1-alexander.stein@ew.tq-group.com Signed-off-by: Daniel Lezcano --- drivers/thermal/imx8mm_thermal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c index e709c03bc4b9f..d247b48696cb6 100644 --- a/drivers/thermal/imx8mm_thermal.c +++ b/drivers/thermal/imx8mm_thermal.c @@ -18,6 +18,7 @@ #include #include "thermal_core.h" +#include "thermal_hwmon.h" #define TER 0x0 /* TMU enable */ #define TPS 0x4 @@ -342,6 +343,9 @@ static int imx8mm_tmu_probe(struct platform_device *pdev) goto disable_clk; } tmu->sensors[i].hw_id = i; + + if (devm_thermal_add_hwmon_sysfs(tmu->sensors[i].tzd)) + dev_warn(&pdev->dev, "failed to add hwmon sysfs attributes\n"); } platform_set_drvdata(pdev, tmu); -- GitLab From de04f680b0ab71d2b9085903fe608a15541f66cb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 26 Nov 2022 11:42:25 +0100 Subject: [PATCH 1376/1423] thermal/core/power allocator: Remove a useless include This file does not use rcu, so there is no point in including . Remove it. Signed-off-by: Christophe JAILLET Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/9adeec47cb5a8193016272d5c8bf936235c1711d.1669459337.git.christophe.jaillet@wanadoo.fr Signed-off-by: Daniel Lezcano --- drivers/thermal/gov_power_allocator.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 2d1aeaba38a88..d5d4eae16771b 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -8,7 +8,6 @@ #define pr_fmt(fmt) "Power allocator: " fmt -#include #include #include -- GitLab From fa17c4136db0bc91e698bd227cb478c648863d11 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 16 Nov 2022 12:31:40 +0100 Subject: [PATCH 1377/1423] dt-bindings: thermal: qcom-tsens: narrow interrupts for SC8280XP, SM6350 and SM8450 Narrow number of interrupts per variants: SC8280XP, SM6350 and SM8450. The compatibles are already used and described. They only missed the constraints of number of interrupts. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221116113140.69587-1-krzysztof.kozlowski@linaro.org Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index f0bd4b979e28a..5bcfddc877d36 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -146,11 +146,14 @@ allOf: - qcom,sc7180-tsens - qcom,sc7280-tsens - qcom,sc8180x-tsens + - qcom,sc8280xp-tsens - qcom,sdm630-tsens - qcom,sdm845-tsens + - qcom,sm6350-tsens - qcom,sm8150-tsens - qcom,sm8250-tsens - qcom,sm8350-tsens + - qcom,sm8450-tsens - qcom,tsens-v2 then: properties: -- GitLab From 8763f8acbf8aef22a2321d4c978cd078aa3b8f64 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Thu, 20 Oct 2022 16:52:37 +0200 Subject: [PATCH 1378/1423] thermal/drivers/qcom/temp-alarm: Fix inaccurate warning for gen2 On gen2 chips the stage2 threshold is not 140 degC but 125 degC. Make the warning message clearer by using this variable and also by including the temperature that was checked for. Fixes: aa92b3310c55 ("thermal/drivers/qcom-spmi-temp-alarm: Add support for GEN2 rev 1 PMIC peripherals") Signed-off-by: Luca Weiss Reviewed-by: Amit Kucheria Link: https://lore.kernel.org/r/20221020145237.942146-1-luca.weiss@fairphone.com Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/qcom-spmi-temp-alarm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c index be785ab37e53d..ad84978109e6f 100644 --- a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c +++ b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c @@ -252,7 +252,8 @@ static int qpnp_tm_update_critical_trip_temp(struct qpnp_tm_chip *chip, disable_s2_shutdown = true; else dev_warn(chip->dev, - "No ADC is configured and critical temperature is above the maximum stage 2 threshold of 140 C! Configuring stage 2 shutdown at 140 C.\n"); + "No ADC is configured and critical temperature %d mC is above the maximum stage 2 threshold of %ld mC! Configuring stage 2 shutdown at %ld mC.\n", + temp, stage2_threshold_max, stage2_threshold_max); } skip: -- GitLab From 2baad2496383c6036bdb0945ef8aa6b1a5d18c19 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 29 Oct 2022 10:59:33 +0530 Subject: [PATCH 1379/1423] thermal/drivers/qcom: Demote error log of thermal zone register to debug devm_thermal_of_zone_register() can fail with -ENODEV if thermal zone for the channel is not represented in DT. This is perfectly fine since not all sensors needs to be used for thermal zones but only a few in real world. So demote the error log to debug to avoid spamming users. Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20221029052933.32421-1-manivannan.sadhasivam@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/qcom-spmi-adc-tm5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c index 88462a4628fb5..ff47fc9ac9c5e 100644 --- a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c +++ b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c @@ -678,7 +678,7 @@ static int adc_tm5_register_tzd(struct adc_tm5_chip *adc_tm) &adc_tm5_thermal_ops); if (IS_ERR(tzd)) { if (PTR_ERR(tzd) == -ENODEV) { - dev_warn(adc_tm->dev, "thermal sensor on channel %d is not used\n", + dev_dbg(adc_tm->dev, "thermal sensor on channel %d is not used\n", adc_tm->channels[i].channel); continue; } -- GitLab From 46cab93ab49f303a7abbcf920cf9ef95d02a113c Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:52 -0500 Subject: [PATCH 1380/1423] thermal/drivers/k3_j72xx_bandgap: Simplify k3_thermal_get_temp() function The k3_thermal_get_temp() function can be simplified to return only the result of k3_bgp_read_temp() without needing the 'ret' variable Signed-off-by: Bryan Brattlof Link: https://lore.kernel.org/r/20221031232702.10339-2-bb@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index c073b1023bbe7..a9f99a190cb61 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -249,14 +249,7 @@ static inline int k3_bgp_read_temp(struct k3_thermal_data *devdata, /* Get temperature callback function for thermal zone */ static int k3_thermal_get_temp(struct thermal_zone_device *tz, int *temp) { - struct k3_thermal_data *data = tz->devdata; - int ret = 0; - - ret = k3_bgp_read_temp(data, temp); - if (ret) - return ret; - - return ret; + return k3_bgp_read_temp(tz->devdata, temp); } static const struct thermal_zone_device_ops k3_of_thermal_ops = { -- GitLab From 311f328ffc7572219bee65db77645e5fedd4e8e6 Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:53 -0500 Subject: [PATCH 1381/1423] thermal/drivers/k3_j72xx_bandgap: Use bool for i2128 erratum flag Some of TI's J721E SoCs require a software trimming method to report temperatures accurately. Currently we are using a few different data types to indicate when we should apply the erratum. Change the 'workaround_needed' variable's data type to a bool to align with how we are using this variable currently. Signed-off-by: Bryan Brattlof Link: https://lore.kernel.org/r/20221031232702.10339-3-bb@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index a9f99a190cb61..b9d20026771a5 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -340,7 +340,7 @@ static void print_look_up_table(struct device *dev, int *ref_table) } struct k3_j72xx_bandgap_data { - unsigned int has_errata_i2128; + const bool has_errata_i2128; }; static int k3_j72xx_bandgap_probe(struct platform_device *pdev) @@ -351,7 +351,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct k3_j72xx_bandgap *bgp; struct k3_thermal_data *data; - int workaround_needed = 0; + bool workaround_needed = false; const struct k3_j72xx_bandgap_data *driver_data; struct thermal_zone_device *ti_thermal; int *ref_table; @@ -522,11 +522,11 @@ static int k3_j72xx_bandgap_remove(struct platform_device *pdev) } static const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j721e_data = { - .has_errata_i2128 = 1, + .has_errata_i2128 = true, }; static const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j7200_data = { - .has_errata_i2128 = 0, + .has_errata_i2128 = false, }; static const struct of_device_id of_k3_j72xx_bandgap_match[] = { -- GitLab From 156f0e2fda420c4a4a7b244a909df04086039bd6 Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:54 -0500 Subject: [PATCH 1382/1423] thermal/drivers/k3_j72xx_bandgap: Remove fuse_base from structure 'fuse_base' is only needed during the initial probe function to provide data for a software trimming method for some of TI's devices affected by the i2128 erratum. The devices not affected will not use this region Remove fuse_base from the main k3_j72xx_bandgap structure Signed-off-by: Bryan Brattlof Link: https://lore.kernel.org/r/20221031232702.10339-4-bb@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index b9d20026771a5..395a73cb37425 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -177,7 +177,6 @@ struct k3_j72xx_bandgap { struct device *dev; void __iomem *base; void __iomem *cfg2_base; - void __iomem *fuse_base; struct k3_thermal_data *ts_data[K3_VTM_MAX_NUM_TS]; }; @@ -276,7 +275,7 @@ static int k3_j72xx_bandgap_temp_to_adc_code(int temp) } static void get_efuse_values(int id, struct k3_thermal_data *data, int *err, - struct k3_j72xx_bandgap *bgp) + void __iomem *fuse_base) { int i, tmp, pow; int ct_offsets[5][K3_VTM_CORRECTION_TEMP_CNT] = { @@ -298,16 +297,16 @@ static void get_efuse_values(int id, struct k3_thermal_data *data, int *err, /* Extract the offset value using bit-mask */ if (ct_offsets[id][i] == -1 && i == 1) { /* 25C offset Case of Sensor 2 split between 2 regs */ - tmp = (readl(bgp->fuse_base + 0x8) & 0xE0000000) >> (29); - tmp |= ((readl(bgp->fuse_base + 0xC) & 0x1F) << 3); + tmp = (readl(fuse_base + 0x8) & 0xE0000000) >> (29); + tmp |= ((readl(fuse_base + 0xC) & 0x1F) << 3); pow = tmp & 0x80; } else if (ct_offsets[id][i] == -1 && i == 2) { /* 125C Case of Sensor 3 split between 2 regs */ - tmp = (readl(bgp->fuse_base + 0x4) & 0xF8000000) >> (27); - tmp |= ((readl(bgp->fuse_base + 0x8) & 0xF) << 5); + tmp = (readl(fuse_base + 0x4) & 0xF8000000) >> (27); + tmp |= ((readl(fuse_base + 0x8) & 0xF) << 5); pow = tmp & 0x100; } else { - tmp = readl(bgp->fuse_base + ct_offsets[id][i]); + tmp = readl(fuse_base + ct_offsets[id][i]); tmp &= ct_bm[id][i]; tmp = tmp >> __ffs(ct_bm[id][i]); @@ -356,6 +355,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) struct thermal_zone_device *ti_thermal; int *ref_table; struct err_values err_vals; + void __iomem *fuse_base; const s64 golden_factors[] = { -490019999999999936, @@ -387,9 +387,9 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) return PTR_ERR(bgp->cfg2_base); res = platform_get_resource(pdev, IORESOURCE_MEM, 2); - bgp->fuse_base = devm_ioremap_resource(dev, res); - if (IS_ERR(bgp->fuse_base)) - return PTR_ERR(bgp->fuse_base); + fuse_base = devm_ioremap_resource(dev, res); + if (IS_ERR(fuse_base)) + return PTR_ERR(fuse_base); driver_data = of_device_get_match_data(dev); if (driver_data) @@ -428,7 +428,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) } /* Workaround not needed if bit30/bit31 is set even for J721e */ - if (workaround_needed && (readl(bgp->fuse_base + 0x0) & 0xc0000000) == 0xc0000000) + if (workaround_needed && (readl(fuse_base + 0x0) & 0xc0000000) == 0xc0000000) workaround_needed = false; dev_dbg(bgp->dev, "Work around %sneeded\n", @@ -452,7 +452,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) err_vals.refs[1] = PLUS30CREF; err_vals.refs[2] = PLUS125CREF; err_vals.refs[3] = PLUS150CREF; - get_efuse_values(id, &data[id], err_vals.errs, bgp); + get_efuse_values(id, &data[id], err_vals.errs, fuse_base); } if (id == 0 && workaround_needed) -- GitLab From 366444ebe7e2e1c987c6a07551f06db4b5b7c0ad Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:55 -0500 Subject: [PATCH 1383/1423] thermal/drivers/k3_j72xx_bandgap: Map fuse_base only for erratum workaround Some of TI's J721E SoCs require a software trimming procedure for the temperature monitors to function properly. To determine if a particular J721E is not affected by this erratum, both bits in the WKUP_SPARE_FUSE0 region must be set. Other SoCs, not affected by this erratum, will not need this region. Map the 'fuse_base' region only when the erratum fix is needed. Signed-off-by: Bryan Brattlof Link: https://lore.kernel.org/r/20221031232702.10339-5-bb@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 34 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index 395a73cb37425..031ea1091909a 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -386,15 +386,32 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) if (IS_ERR(bgp->cfg2_base)) return PTR_ERR(bgp->cfg2_base); - res = platform_get_resource(pdev, IORESOURCE_MEM, 2); - fuse_base = devm_ioremap_resource(dev, res); - if (IS_ERR(fuse_base)) - return PTR_ERR(fuse_base); - driver_data = of_device_get_match_data(dev); if (driver_data) workaround_needed = driver_data->has_errata_i2128; + /* + * Some of TI's J721E SoCs require a software trimming procedure + * for the temperature monitors to function properly. To determine + * if this particular SoC is NOT affected, both bits in the + * WKUP_SPARE_FUSE0[31:30] will be set (0xC0000000) indicating + * when software trimming should NOT be applied. + * + * https://www.ti.com/lit/er/sprz455c/sprz455c.pdf + */ + if (workaround_needed) { + res = platform_get_resource(pdev, IORESOURCE_MEM, 2); + fuse_base = devm_ioremap_resource(dev, res); + if (IS_ERR(fuse_base)) + return PTR_ERR(fuse_base); + + if ((readl(fuse_base) & 0xc0000000) == 0xc0000000) + workaround_needed = false; + } + + dev_dbg(bgp->dev, "Work around %sneeded\n", + workaround_needed ? "" : "not "); + pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); if (ret < 0) { @@ -427,13 +444,6 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) goto err_free_ref_table; } - /* Workaround not needed if bit30/bit31 is set even for J721e */ - if (workaround_needed && (readl(fuse_base + 0x0) & 0xc0000000) == 0xc0000000) - workaround_needed = false; - - dev_dbg(bgp->dev, "Work around %sneeded\n", - workaround_needed ? "" : "not "); - if (!workaround_needed) init_table(5, ref_table, golden_factors); else -- GitLab From effe8db0a421480fc4dad0c7bf380b8e245cbb8c Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:56 -0500 Subject: [PATCH 1384/1423] dt-bindings: thermal: k3-j72xx: elaborate on binding description Elaborate on the function of this device node as well as some of the properties this node uses. Signed-off-by: Bryan Brattlof Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221031232702.10339-6-bb@ti.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/ti,j72xx-thermal.yaml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml index c74f124ebfc00..3bb870a26872f 100644 --- a/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml @@ -9,6 +9,19 @@ title: Texas Instruments J72XX VTM (DTS) binding maintainers: - Keerthy +description: | + The TI K3 family of SoCs typically have a Voltage & Thermal + Management (VTM) device to control up to 8 temperature diode + sensors to measure silicon junction temperatures from different + hotspots of the chip as well as provide temperature, interrupt + and alerting information. + + The following polynomial equation can then be used to convert + value returned by this device into a temperature in Celsius + + Temp(C) = (-9.2627e-12) * x^4 + (6.0373e-08) * x^3 + \ + (-1.7058e-04) * x^2 + (3.2512e-01) * x + (-4.9003e+01) + properties: compatible: enum: @@ -19,7 +32,11 @@ properties: items: - description: VTM cfg1 register space - description: VTM cfg2 register space - - description: VTM efuse register space + - description: | + A software trimming method must be applied to some Jacinto + devices to function properly. This eFuse region provides + the information needed for these SoCs to report + temperatures accurately. power-domains: maxItems: 1 -- GitLab From c4026d3e2578291016703cd75f3a6f786f60cd80 Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Mon, 31 Oct 2022 18:26:57 -0500 Subject: [PATCH 1385/1423] dt-bindings: thermal: k3-j72xx: conditionally require efuse reg range Only some of TI's J721E SoCs will need a eFuse register range mapped to determine if they're affected by TI's i2128 erratum. All other SoC will not need this eFuse range to function properly Update the bindings for the k3_j72xx_bandgap thermal driver so other devices will only need two register ranges Signed-off-by: Bryan Brattlof Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221031232702.10339-7-bb@ti.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/ti,j72xx-thermal.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml index 3bb870a26872f..0509c9cec224d 100644 --- a/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml @@ -37,6 +37,7 @@ properties: devices to function properly. This eFuse region provides the information needed for these SoCs to report temperatures accurately. + minItems: 2 power-domains: maxItems: 1 @@ -44,6 +45,21 @@ properties: "#thermal-sensor-cells": const: 1 +allOf: + - if: + properties: + compatible: + contains: + const: ti,j721e-vtm + then: + properties: + reg: + minItems: 3 + else: + properties: + reg: + maxItems: 2 + required: - compatible - reg -- GitLab From bf438ed026199ff0182408d8592b6c11382d1a51 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 15 Nov 2022 12:16:29 +0000 Subject: [PATCH 1386/1423] dt-bindings: thermal: rzg2l-thermal: Document RZ/Five SoC The TSU block on the RZ/Five SoC is identical to one found on the RZ/G2UL SoC. "renesas,r9a07g043-tsu" compatible string will be used on the RZ/Five SoC so to make this clear, update the comment to include RZ/Five SoC. No driver changes are required as generic compatible string "renesas,rzg2l-tsu" will be used as a fallback on RZ/Five SoC. Signed-off-by: Lad Prabhakar Acked-by: Rob Herring Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20221115121629.1181667-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml index 1d83733978481..03f4b926e53c9 100644 --- a/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml @@ -17,7 +17,7 @@ properties: compatible: items: - enum: - - renesas,r9a07g043-tsu # RZ/G2UL + - renesas,r9a07g043-tsu # RZ/G2UL and RZ/Five - renesas,r9a07g044-tsu # RZ/G2{L,LC} - renesas,r9a07g054-tsu # RZ/V2L - const: renesas,rzg2l-tsu -- GitLab From 33dc955c5a2795f17ff81751563836d498ac794f Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 17 Nov 2022 14:09:52 +0800 Subject: [PATCH 1387/1423] thermal/drivers/st: Use devm_platform_get_and_ioremap_resource() Convert platform_get_resource(), devm_ioremap_resource() to a single call to devm_platform_get_and_ioremap_resource(), as this is exactly what this function does. Signed-off-by: Minghao Chi Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202211171409524332954@zte.com.cn Signed-off-by: Daniel Lezcano --- drivers/thermal/st/stm_thermal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/thermal/st/stm_thermal.c b/drivers/thermal/st/stm_thermal.c index 78feb802a87d2..e7834ccc7976b 100644 --- a/drivers/thermal/st/stm_thermal.c +++ b/drivers/thermal/st/stm_thermal.c @@ -488,7 +488,6 @@ MODULE_DEVICE_TABLE(of, stm_thermal_of_match); static int stm_thermal_probe(struct platform_device *pdev) { struct stm_thermal_sensor *sensor; - struct resource *res; void __iomem *base; int ret; @@ -506,8 +505,7 @@ static int stm_thermal_probe(struct platform_device *pdev) sensor->dev = &pdev->dev; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - base = devm_ioremap_resource(&pdev->dev, res); + base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(base)) return PTR_ERR(base); -- GitLab From 4a9f20112c22cdcd6b81fafb18534a2a5a629a14 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 16 Nov 2022 11:09:50 +0100 Subject: [PATCH 1388/1423] dt-bindings: thermal: qcom-tsens: Add compatible for sm8550 The Qualcomm SM8550 platform has three instances of the tsens block, add a compatible for these instances. Signed-off-by: Neil Armstrong Reviewed-by: Krzysztof Kozlowski Acked-by: Amit Kucheria Link: https://lore.kernel.org/r/20221114-narmstrong-sm8550-upstream-tsens-v1-0-0e169822830f@linaro.org Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 5bcfddc877d36..7e04804b6e80b 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -58,6 +58,7 @@ properties: - qcom,sm8250-tsens - qcom,sm8350-tsens - qcom,sm8450-tsens + - qcom,sm8550-tsens - const: qcom,tsens-v2 - description: v2 of TSENS with combined interrupt -- GitLab From 46a891e45be97c6781ac34f5ec777d69370e252b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 16 Mar 2022 11:03:22 -0700 Subject: [PATCH 1389/1423] thermal/drivers/qcom/lmh: Fix irq handler return value After enough invocations the LMh irq is eventually reported as bad, because the handler doesn't return IRQ_HANDLED, fix this. Fixes: 53bca371cdf7 ("thermal/drivers/qcom: Add support for LMh driver") Reported-by: Daniel Lezcano Signed-off-by: Bjorn Andersson Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20220316180322.88132-1-bjorn.andersson@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/lmh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/lmh.c b/drivers/thermal/qcom/lmh.c index d3d9b9fa49e81..4122a51e98741 100644 --- a/drivers/thermal/qcom/lmh.c +++ b/drivers/thermal/qcom/lmh.c @@ -45,7 +45,7 @@ static irqreturn_t lmh_handle_irq(int hw_irq, void *data) if (irq) generic_handle_irq(irq); - return 0; + return IRQ_HANDLED; } static void lmh_enable_interrupt(struct irq_data *d) -- GitLab From 5011a110295d25418f5918a6af7bfcdb00dd4e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 12 Dec 2022 23:02:17 +0100 Subject: [PATCH 1390/1423] thermal/drivers/imx_sc_thermal: Drop empty platform remove function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A remove callback just returning 0 is equivalent to no remove callback at all. So drop the useless function. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221212220217.3777176-1-u.kleine-koenig@pengutronix.de Signed-off-by: Daniel Lezcano --- drivers/thermal/imx_sc_thermal.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/thermal/imx_sc_thermal.c b/drivers/thermal/imx_sc_thermal.c index 5d92b70a5d53a..4df925e3a80bd 100644 --- a/drivers/thermal/imx_sc_thermal.c +++ b/drivers/thermal/imx_sc_thermal.c @@ -127,11 +127,6 @@ static int imx_sc_thermal_probe(struct platform_device *pdev) return 0; } -static int imx_sc_thermal_remove(struct platform_device *pdev) -{ - return 0; -} - static int imx_sc_sensors[] = { IMX_SC_R_SYSTEM, IMX_SC_R_PMIC_0, -1 }; static const struct of_device_id imx_sc_thermal_table[] = { @@ -142,7 +137,6 @@ MODULE_DEVICE_TABLE(of, imx_sc_thermal_table); static struct platform_driver imx_sc_thermal_driver = { .probe = imx_sc_thermal_probe, - .remove = imx_sc_thermal_remove, .driver = { .name = "imx-sc-thermal", .of_match_table = imx_sc_thermal_table, -- GitLab From 549a715b98a13c6d05452be3ad37e980087bb081 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 7 Dec 2022 00:09:59 +0000 Subject: [PATCH 1391/1423] KVM: x86: Add proper ReST tables for userspace MSR exits/flags Add ReST formatting to the set of userspace MSR exits/flags so that the resulting HTML docs generate a table instead of malformed gunk. This also fixes a warning that was introduced by a recent cleanup of the relevant documentation (yay copy+paste). >> Documentation/virt/kvm/api.rst:7287: WARNING: Block quote ends without a blank line; unexpected unindent. Fixes: 1ae099540e8c ("KVM: x86: Allow deflecting unknown MSR accesses to user space") Fixes: 1f158147181b ("KVM: x86: Clean up KVM_CAP_X86_USER_SPACE_MSR documentation") Reported-by: kernel test robot Signed-off-by: Sean Christopherson Message-Id: <20221207000959.2035098-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c618fae44ad7f..778c6460d1de6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6455,9 +6455,11 @@ The "reason" field specifies why the MSR interception occurred. Userspace will only receive MSR exits when a particular reason was requested during through ENABLE_CAP. Currently valid exit reasons are: - KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM - KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits - KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER +============================ ======================================== + KVM_MSR_EXIT_REASON_UNKNOWN access to MSR that is unknown to KVM + KVM_MSR_EXIT_REASON_INVAL access to invalid MSRs or reserved bits + KVM_MSR_EXIT_REASON_FILTER access blocked by KVM_X86_SET_MSR_FILTER +============================ ======================================== For KVM_EXIT_X86_RDMSR, the "index" field tells userspace which MSR the guest wants to read. To respond to this request with a successful read, userspace @@ -7256,11 +7258,13 @@ to inform a user that an MSR was not emulated/virtualized by KVM. The valid mask flags are: - KVM_MSR_EXIT_REASON_UNKNOWN - intercept accesses to unknown (to KVM) MSRs - KVM_MSR_EXIT_REASON_INVAL - intercept accesses that are architecturally - invalid according to the vCPU model and/or mode - KVM_MSR_EXIT_REASON_FILTER - intercept accesses that are denied by userspace - via KVM_X86_SET_MSR_FILTER +============================ =============================================== + KVM_MSR_EXIT_REASON_UNKNOWN intercept accesses to unknown (to KVM) MSRs + KVM_MSR_EXIT_REASON_INVAL intercept accesses that are architecturally + invalid according to the vCPU model and/or mode + KVM_MSR_EXIT_REASON_FILTER intercept accesses that are denied by userspace + via KVM_X86_SET_MSR_FILTER +============================ =============================================== 7.22 KVM_CAP_X86_BUS_LOCK_EXIT ------------------------------- -- GitLab From 025e3b507a3a8e1ee96a3112bb67495c77d6cdb6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 1 Nov 2022 17:09:46 +0200 Subject: [PATCH 1392/1423] fbdev: ssd1307fb: Drop optional dependency Only a single out of three devices need a PWM, so from driver it's optional. Moreover it's a single driver in the entire kernel that currently selects PWM. Unfortunately this selection is a root cause of the circular dependencies when we want to enable optional PWM for some other drivers that select GPIOLIB. Fixes: a2ed00da5047 ("drivers/video: add support for the Solomon SSD1307 OLED Controller") Signed-off-by: Andy Shevchenko Signed-off-by: Helge Deller --- drivers/video/fbdev/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 71019b167f8b0..66f36b69e8f39 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2254,7 +2254,6 @@ config FB_SSD1307 select FB_SYS_COPYAREA select FB_SYS_IMAGEBLIT select FB_DEFERRED_IO - select PWM select FB_BACKLIGHT help This driver implements support for the Solomon SSD1307 -- GitLab From 6273c43769cbd4451b03d239dc16d5c54bb7279a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 1 Nov 2022 17:09:47 +0200 Subject: [PATCH 1393/1423] fbdev: ssd1307fb: Drop duplicate NULL checks for PWM APIs pwm_disable() and pwm_put() are NULL-aware, no need to duplicate the check in the caller. Signed-off-by: Andy Shevchenko Signed-off-by: Helge Deller --- drivers/video/fbdev/ssd1307fb.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c index 5c891aa00d596..046b9990d27cd 100644 --- a/drivers/video/fbdev/ssd1307fb.c +++ b/drivers/video/fbdev/ssd1307fb.c @@ -803,10 +803,8 @@ static int ssd1307fb_probe(struct i2c_client *client) bl_init_error: unregister_framebuffer(info); panel_init_error: - if (par->device_info->need_pwm) { - pwm_disable(par->pwm); - pwm_put(par->pwm); - } + pwm_disable(par->pwm); + pwm_put(par->pwm); regulator_enable_error: if (par->vbat_reg) regulator_disable(par->vbat_reg); @@ -827,10 +825,8 @@ static void ssd1307fb_remove(struct i2c_client *client) backlight_device_unregister(info->bl_dev); unregister_framebuffer(info); - if (par->device_info->need_pwm) { - pwm_disable(par->pwm); - pwm_put(par->pwm); - } + pwm_disable(par->pwm); + pwm_put(par->pwm); if (par->vbat_reg) regulator_disable(par->vbat_reg); fb_deferred_io_cleanup(info); -- GitLab From 28f24e90ffc4bf2199c91783b98dfc71de1e3a2f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 1 Nov 2022 10:29:47 +0000 Subject: [PATCH 1394/1423] fbdev: omapfb: remove redundant variable checksum Variable checksum is being used to accumulate values however it is never read or used afterwards. It is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Helge Deller --- drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c index cb63bc0e92caa..b33f62c5cb220 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c @@ -129,7 +129,6 @@ static int hdmi_core_ddc_edid(struct hdmi_core_data *core, u8 *pedid, u8 ext) { void __iomem *base = core->base; u8 cur_addr; - char checksum = 0; const int retries = 1000; u8 seg_ptr = ext / 2; u8 edidbase = ((ext % 2) * 0x80); @@ -178,7 +177,6 @@ static int hdmi_core_ddc_edid(struct hdmi_core_data *core, u8 *pedid, u8 ext) } pedid[cur_addr] = REG_GET(base, HDMI_CORE_I2CM_DATAI, 7, 0); - checksum += pedid[cur_addr]; } return 0; -- GitLab From 257030d4ee7c76892e9a4e5bf94f8c0fc41c6e46 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 3 Nov 2022 20:16:30 -0700 Subject: [PATCH 1395/1423] fbdev: omapfb: connector-hdmi: switch to using gpiod API Switch the driver from legacy gpio API that is deprecated to the newer gpiod API that respects line polarities described in ACPI/DT. Signed-off-by: Dmitry Torokhov Signed-off-by: Helge Deller --- .../omap2/omapfb/displays/connector-hdmi.c | 49 ++++++------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/displays/connector-hdmi.c b/drivers/video/fbdev/omap2/omapfb/displays/connector-hdmi.c index 670b9c6eb5a9c..8f9ff9fb4ca4c 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/connector-hdmi.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/connector-hdmi.c @@ -6,11 +6,12 @@ * Author: Tomi Valkeinen */ +#include +#include #include #include #include #include -#include #include @@ -41,7 +42,7 @@ struct panel_drv_data { struct omap_video_timings timings; - int hpd_gpio; + struct gpio_desc *hpd_gpio; }; #define to_panel_data(x) container_of(x, struct panel_drv_data, dssdev) @@ -155,8 +156,8 @@ static bool hdmic_detect(struct omap_dss_device *dssdev) struct panel_drv_data *ddata = to_panel_data(dssdev); struct omap_dss_device *in = ddata->in; - if (gpio_is_valid(ddata->hpd_gpio)) - return gpio_get_value_cansleep(ddata->hpd_gpio); + if (ddata->hpd_gpio) + return gpiod_get_value_cansleep(ddata->hpd_gpio); else return in->ops.hdmi->detect(in); } @@ -197,31 +198,6 @@ static struct omap_dss_driver hdmic_driver = { .set_hdmi_infoframe = hdmic_set_infoframe, }; -static int hdmic_probe_of(struct platform_device *pdev) -{ - struct panel_drv_data *ddata = platform_get_drvdata(pdev); - struct device_node *node = pdev->dev.of_node; - struct omap_dss_device *in; - int gpio; - - /* HPD GPIO */ - gpio = of_get_named_gpio(node, "hpd-gpios", 0); - if (gpio_is_valid(gpio)) - ddata->hpd_gpio = gpio; - else - ddata->hpd_gpio = -ENODEV; - - in = omapdss_of_find_source_for_first_ep(node); - if (IS_ERR(in)) { - dev_err(&pdev->dev, "failed to find video source\n"); - return PTR_ERR(in); - } - - ddata->in = in; - - return 0; -} - static int hdmic_probe(struct platform_device *pdev) { struct panel_drv_data *ddata; @@ -238,15 +214,18 @@ static int hdmic_probe(struct platform_device *pdev) platform_set_drvdata(pdev, ddata); ddata->dev = &pdev->dev; - r = hdmic_probe_of(pdev); + ddata->hpd_gpio = devm_gpiod_get_optional(&pdev->dev, "hpd", GPIOD_IN); + r = PTR_ERR_OR_ZERO(ddata->hpd_gpio); if (r) return r; - if (gpio_is_valid(ddata->hpd_gpio)) { - r = devm_gpio_request_one(&pdev->dev, ddata->hpd_gpio, - GPIOF_DIR_IN, "hdmi_hpd"); - if (r) - goto err_reg; + gpiod_set_consumer_name(ddata->hpd_gpio, "hdmi_hpd"); + + ddata->in = omapdss_of_find_source_for_first_ep(pdev->dev.of_node); + r = PTR_ERR_OR_ZERO(ddata->in); + if (r) { + dev_err(&pdev->dev, "failed to find video source\n"); + return r; } ddata->timings = hdmic_default_timings; -- GitLab From 5845b32edc1ee1039d040716cf9d78bbd52e13c3 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 3 Nov 2022 20:16:31 -0700 Subject: [PATCH 1396/1423] fbdev: omapfb: panel-sony-acx565akm: remove support for platform data There are no users of panel_acx565akm_platform_data in the mainline kernel so support for it can be removed from the panel driver. Signed-off-by: Dmitry Torokhov Signed-off-by: Helge Deller --- .../omapfb/displays/panel-sony-acx565akm.c | 45 +++---------------- include/video/omap-panel-data.h | 16 ------- 2 files changed, 6 insertions(+), 55 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c index c0965bee12c57..0c81d3ff41974 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c @@ -23,7 +23,6 @@ #include #include